[[bench]]
harness = false
name = "bert_benchmark"
path = "benches/bert_benchmark.rs"

[[bench]]
harness = false
name = "bpe_benchmark"
path = "benches/bpe_benchmark.rs"

[[bench]]
harness = false
name = "layout_benchmark"
path = "benches/layout_benchmark.rs"

[[bench]]
harness = false
name = "llama3"
path = "benches/llama3.rs"
required-features = ["http"]

[[bench]]
harness = false
name = "unigram_benchmark"
path = "benches/unigram_benchmark.rs"

[dependencies.aho-corasick]
version = "1.1"

[dependencies.derive_builder]
version = "0.20"

[dependencies.esaxx-rs]
default-features = false
features = []
version = "0.1.10"

[dependencies.fancy-regex]
optional = true
version = "0.13"

[dependencies.getrandom]
version = "0.2.10"

[dependencies.hf-hub]
optional = true
version = "0.3.2"

[dependencies.indicatif]
optional = true
version = "0.17"

[dependencies.itertools]
version = "0.12"

[dependencies.lazy_static]
version = "1.4"

[dependencies.log]
version = "0.4"

[dependencies.macro_rules_attribute]
version = "0.2.0"

[dependencies.monostate]
version = "0.1.12"

[dependencies.onig]
default-features = false
optional = true
version = "6.4"

[dependencies.paste]
version = "1.0.14"

[dependencies.rand]
version = "0.8"

[dependencies.rayon]
version = "1.10"

[dependencies.rayon-cond]
version = "0.3"

[dependencies.regex]
version = "1.10"

[dependencies.regex-syntax]
version = "0.8"

[dependencies.serde]
features = ["derive"]
version = "1.0"

[dependencies.serde_json]
version = "1.0"

[dependencies.spm_precompiled]
version = "0.1.3"

[dependencies.thiserror]
version = "1.0.49"

[dependencies.unicode-normalization-alignments]
version = "0.1"

[dependencies.unicode-segmentation]
version = "1.11"

[dependencies.unicode_categories]
version = "0.1"

[dev-dependencies.assert_approx_eq]
version = "1.1"

[dev-dependencies.criterion]
version = "0.5"

[dev-dependencies.tempfile]
version = "3.10"

[dev-dependencies.tracing]
version = "0.1"

[dev-dependencies.tracing-subscriber]
version = "0.3.18"

[[example]]
name = "encode_batch"
path = "examples/encode_batch.rs"
required-features = ["http"]
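
# A sketch of running this feature-gated example (its
# `required-features = ["http"]` means the feature must be enabled
# explicitly):
#
#   cargo run --release --example encode_batch --features http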

[[example]]
name = "serialization"
path = "examples/serialization.rs"

[features]
default = ["progressbar", "onig", "esaxx_fast"]
esaxx_fast = ["esaxx-rs/cpp"]
http = ["hf-hub"]
progressbar = ["indicatif"]
unstable_wasm = ["fancy-regex", "getrandom/js"]
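
# A minimal sketch of how a downstream crate might enable the optional
# `http` feature on top of the defaults, in its own Cargo.toml:
#
#   [dependencies.tokenizers]
#   version = "0.21.0"
#   features = ["http"]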

[lib]
bench = false
name = "tokenizers"
path = "src/lib.rs"

[package]
authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
autobenches = false
autobins = false
autoexamples = false
autotests = false
build = false
description = """
Provides an implementation of today's most used tokenizers,
with a focus on performances and versatility.
"""
documentation = "https://docs.rs/tokenizers/"
edition = "2018"
exclude = ["rust-toolchain", "target/*", "Cargo.lock", "benches/*.txt", "benches/*.json", "data/*"]
homepage = "https://github.com/huggingface/tokenizers"
keywords = ["tokenizer", "NLP", "huggingface", "BPE", "WordPiece"]
license = "Apache-2.0"
name = "tokenizers"
readme = "README.md"
repository = "https://github.com/huggingface/tokenizers"
version = "0.21.0"

[profile.release]
lto = "fat"

[[test]]
name = "added_tokens"
path = "tests/added_tokens.rs"

[[test]]
name = "documentation"
path = "tests/documentation.rs"

[[test]]
name = "from_pretrained"
path = "tests/from_pretrained.rs"

[[test]]
name = "offsets"
path = "tests/offsets.rs"

[[test]]
name = "serialization"
path = "tests/serialization.rs"

[[test]]
name = "training"
path = "tests/training.rs"

[[test]]
name = "unigram"
path = "tests/unigram.rs"