diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 89d8d9b..256129b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -34,7 +34,7 @@ jobs:
           cd ..
       - name: Cargo Test With Release Build
-        run: ORT_LIB_LOCATION="$(pwd)/onnxruntime/build/Linux/Release" cargo test --release --no-default-features --features online
+        run: ORT_LIB_LOCATION="$(pwd)/onnxruntime/build/Linux/Release" cargo test --release --no-default-features --features hf-hub-native-tls
       - name: Cargo Test Offline
         run: ORT_LIB_LOCATION="$(pwd)/onnxruntime/build/Linux/Release" cargo test --no-default-features
diff --git a/Cargo.toml b/Cargo.toml
index 3221b58..10ab8cc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "fastembed"
-version = "4.4.0"
+version = "4.5.0"
 edition = "2021"
 description = "Rust implementation of https://github.com/qdrant/fastembed"
 license = "Apache-2.0"
@@ -23,7 +23,7 @@ homepage = "https://crates.io/crates/fastembed"

 [dependencies]
 anyhow = { version = "1" }
-hf-hub = { version = "0.3", default-features = false }
+hf-hub = { version = "0.4.1", default-features = false, optional = true }
 image = "0.25.2"
 ndarray = { version = "0.16", default-features = false }
 ort = { version = "=2.0.0-rc.9", default-features = false, features = [
@@ -31,13 +31,21 @@ ort = { version = "=2.0.0-rc.9", default-features = false, features = [
 ] }
 rayon = { version = "1.10", default-features = false }
 serde_json = { version = "1" }
-tokenizers = { version = "0.19", default-features = false, features = ["onig"] }
+tokenizers = { version = "0.21", default-features = false, features = ["onig"] }

 [features]
-default = ["ort-download-binaries", "online"]
-online = ["hf-hub/online"]
+default = ["ort-download-binaries", "hf-hub-native-tls"]
+
+hf-hub = ["dep:hf-hub", "hf-hub?/ureq"]
+hf-hub-native-tls = ["hf-hub", "hf-hub?/native-tls"]
+hf-hub-rustls-tls = ["hf-hub", "hf-hub?/rustls-tls"]
+
 ort-download-binaries = ["ort/download-binaries"]
 ort-load-dynamic = ["ort/load-dynamic"]
+
 # This feature does not change any code, but is used to limit tests if
 # the user does not have `optimum-cli` or even python installed.
 optimum-cli = []
+
+# For compatibility; recommend using hf-hub-native-tls
+online = ["hf-hub-native-tls"]
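The feature matrix above changes how downstream users opt into Hub downloads: the old `online` feature is kept only as an alias, Hub access is gated behind the optional `hf-hub` dependency, and the TLS backend is picked via `hf-hub-native-tls` or `hf-hub-rustls-tls` (e.g. `default-features = false, features = ["ort-download-binaries", "hf-hub-rustls-tls"]` in a consumer's Cargo.toml). A minimal sketch of what the default feature set enables; the model choice and input text are illustrative only:

```rust
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};

fn main() -> anyhow::Result<()> {
    // With a `hf-hub-*` feature enabled (the default includes
    // `hf-hub-native-tls`), try_new may fetch the model files from the
    // Hugging Face Hub on first use and cache them locally.
    let model = TextEmbedding::try_new(InitOptions::new(EmbeddingModel::AllMiniLML6V2))?;
    let embeddings = model.embed(vec!["hello world"], None)?;
    println!("dimension: {}", embeddings[0].len());
    Ok(())
}
```

Builds with `--no-default-features` compile out `try_new` entirely (it is `#[cfg(feature = "hf-hub")]` below), leaving only the user-defined-model constructors.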
diff --git a/README.md b/README.md
index 5dd9da0..4c7dc0e 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,7 @@
 - [**sentence-transformers/all-MiniLM-L12-v2**](https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2)
 - [**sentence-transformers/paraphrase-MiniLM-L12-v2**](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L12-v2)
 - [**sentence-transformers/paraphrase-multilingual-mpnet-base-v2**](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2)
+- [**lightonai/ModernBERT-embed-large**](https://huggingface.co/lightonai/modernbert-embed-large)
 - [**nomic-ai/nomic-embed-text-v1**](https://huggingface.co/nomic-ai/nomic-embed-text-v1)
 - [**nomic-ai/nomic-embed-text-v1.5**](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) - pairs with the `nomic-embed-vision-v1.5` image model for image-to-text search
 - [**intfloat/multilingual-e5-small**](https://huggingface.co/intfloat/multilingual-e5-small)
diff --git a/src/common.rs b/src/common.rs
index f1441d0..1c0ce8c 100644
--- a/src/common.rs
+++ b/src/common.rs
@@ -1,5 +1,5 @@
 use anyhow::Result;
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use hf_hub::api::sync::ApiRepo;
 use std::io::Read;
 use std::{fs::File, path::PathBuf};
@@ -29,7 +29,7 @@ pub struct TokenizerFiles {
 /// The procedure for loading tokenizer files from the hugging face hub is separated
 /// from the main load_tokenizer function (which is expecting bytes, from any source).
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 pub fn load_tokenizer_hf_hub(model_repo: ApiRepo, max_length: usize) -> Result<Tokenizer> {
     let tokenizer_files: TokenizerFiles = TokenizerFiles {
         tokenizer_file: read_file_to_bytes(&model_repo.get("tokenizer.json")?)?,
@@ -49,7 +49,7 @@ pub fn load_tokenizer(tokenizer_files: TokenizerFiles, max_length: usize) -> Result<Tokenizer> {
     let base_error_message = "Error building TokenizerFiles for UserDefinedEmbeddingModel.
Could not read {} file.";

-    // Serialise each tokenizer file
+    // Deserialize each tokenizer file
     let config: serde_json::Value =
         serde_json::from_slice(&tokenizer_files.config_file).map_err(|_| {
             std::io::Error::new(
@@ -88,7 +88,7 @@ pub fn load_tokenizer(tokenizer_files: TokenizerFiles, max_length: usize) -> Result<Tokenizer> {
     let pad_id = config["pad_token_id"].as_u64().unwrap_or(0) as u32;
     let pad_token = tokenizer_config["pad_token"]
         .as_str()
-        .expect("Error reading pad_token from tokenier_config.json")
+        .expect("Error reading pad_token from tokenizer_config.json")
         .into();

     let mut tokenizer = tokenizer
diff --git a/src/image_embedding/impl.rs b/src/image_embedding/impl.rs
index fe6f979..c526e30 100644
--- a/src/image_embedding/impl.rs
+++ b/src/image_embedding/impl.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use hf_hub::{
     api::sync::{ApiBuilder, ApiRepo},
     Cache,
@@ -8,7 +8,7 @@ use ort::{
     session::{builder::GraphOptimizationLevel, Session},
     value::Value,
 };
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use std::path::PathBuf;
 use std::{path::Path, thread::available_parallelism};

@@ -17,10 +17,10 @@ use crate::{
     ModelInfo,
 };
 use anyhow::anyhow;
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use anyhow::Context;

-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use super::ImageInitOptions;
 use super::{
     init::{ImageInitOptionsUserDefined, UserDefinedImageEmbeddingModel},
@@ -35,7 +35,7 @@ impl ImageEmbedding {
     /// Uses the highest level of Graph optimization
     ///
     /// Uses the total number of CPUs available as the number of intra-threads
-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     pub fn try_new(options: ImageInitOptions) -> anyhow::Result<Self> {
         let ImageInitOptions {
             model_name,
@@ -104,7 +104,7 @@ impl ImageEmbedding {
     }

     /// Return the ImageEmbedding model's directory from cache or remote retrieval
-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     fn retrieve_model(
         model: ImageEmbeddingModel,
         cache_dir: PathBuf,
diff --git a/src/image_embedding/utils.rs b/src/image_embedding/utils.rs
index 47bfe9b..dd7293d 100644
--- a/src/image_embedding/utils.rs
+++ b/src/image_embedding/utils.rs
@@ -2,7 +2,7 @@ use anyhow::{anyhow, Result};
 use image::{imageops::FilterType, DynamicImage, GenericImageView};
 use ndarray::{Array, Array3};
 use std::ops::{Div, Sub};
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use std::{fs::read_to_string, path::Path};

 pub enum TransformData {
@@ -171,7 +171,7 @@ impl Compose {
         Self { transforms }
     }

-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     pub fn from_file<P: AsRef<Path>>(file: P) -> anyhow::Result<Self> {
         let content = read_to_string(file)?;
         let config = serde_json::from_str(&content)?;
diff --git a/src/lib.rs b/src/lib.rs
index e8f316a..190b164 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@
 //! The library provides the TextEmbedding struct to interface with text embedding models.
 //!
 #![cfg_attr(
-    feature = "online",
+    feature = "hf-hub",
     doc = r#"
 ### Instantiating [TextEmbedding](crate::TextEmbedding)
 ```
@@ -28,7 +28,7 @@
 //! Find more info about the available options in the [InitOptions](crate::InitOptions) documentation.
 //!
 #![cfg_attr(
-    feature = "online",
+    feature = "hf-hub",
     doc = r#"
 ### Embeddings generation
 ```
@@ -67,23 +67,33 @@ pub use ort::execution_providers::ExecutionProviderDispatch;
 pub use crate::common::{
     read_file_to_bytes, Embedding, Error, SparseEmbedding, TokenizerFiles, DEFAULT_CACHE_DIR,
 };
-pub use crate::image_embedding::{
-    ImageEmbedding, ImageInitOptions, ImageInitOptionsUserDefined, UserDefinedImageEmbeddingModel,
-};
-pub use crate::models::image_embedding::ImageEmbeddingModel;
-pub use crate::models::reranking::{RerankerModel, RerankerModelInfo};
 pub use crate::models::{
-    model_info::ModelInfo, quantization::QuantizationMode, text_embedding::EmbeddingModel,
+    model_info::ModelInfo, model_info::RerankerModelInfo, quantization::QuantizationMode,
 };
 pub use crate::output::{EmbeddingOutput, OutputKey, OutputPrecedence, SingleBatchOutput};
 pub use crate::pooling::Pooling;
-pub use crate::reranking::{
-    OnnxSource, RerankInitOptions, RerankInitOptionsUserDefined, RerankResult, TextRerank,
-    UserDefinedRerankingModel,
+
+// For Text Embedding
+pub use crate::models::text_embedding::EmbeddingModel;
+pub use crate::text_embedding::{
+    InitOptions, InitOptionsUserDefined, TextEmbedding, UserDefinedEmbeddingModel,
 };
+
+// For Sparse Text Embedding
+pub use crate::models::sparse::SparseModel;
 pub use crate::sparse_text_embedding::{
     SparseInitOptions, SparseTextEmbedding, UserDefinedSparseModel,
 };
-pub use crate::text_embedding::{
-    InitOptions, InitOptionsUserDefined, TextEmbedding, UserDefinedEmbeddingModel,
+
+// For Image Embedding
+pub use crate::image_embedding::{
+    ImageEmbedding, ImageInitOptions, ImageInitOptionsUserDefined, UserDefinedImageEmbeddingModel,
+};
+pub use crate::models::image_embedding::ImageEmbeddingModel;
+
+// For Reranking
+pub use crate::models::reranking::RerankerModel;
+pub use crate::reranking::{
+    OnnxSource, RerankInitOptions, RerankInitOptionsUserDefined, RerankResult, TextRerank,
+    UserDefinedRerankingModel,
 };
diff --git a/src/models/model_info.rs b/src/models/model_info.rs
index 67d71f2..4d7b184 100644
--- a/src/models/model_info.rs
+++ b/src/models/model_info.rs
@@ -1,3 +1,5 @@
+use crate::RerankerModel;
+
 /// Data struct about the available models
 #[derive(Debug, Clone)]
 pub struct ModelInfo<T> {
@@ -8,3 +10,13 @@ pub struct ModelInfo<T> {
     pub model_file: String,
     pub additional_files: Vec<String>,
 }
+
+/// Data struct about the available reranker models
+#[derive(Debug, Clone)]
+pub struct RerankerModelInfo {
+    pub model: RerankerModel,
+    pub description: String,
+    pub model_code: String,
+    pub model_file: String,
+    pub additional_files: Vec<String>,
+}
diff --git a/src/models/reranking.rs b/src/models/reranking.rs
index 59764e3..6a5dd31 100644
--- a/src/models/reranking.rs
+++ b/src/models/reranking.rs
@@ -1,5 +1,7 @@
 use std::fmt::Display;

+use crate::RerankerModelInfo;
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum RerankerModel {
     /// BAAI/bge-reranker-base
@@ -46,16 +48,6 @@ pub fn reranker_model_list() -> Vec<RerankerModelInfo> {
     reranker_model_list
 }

-/// Data struct about the available reanker models
-#[derive(Debug, Clone)]
-pub struct RerankerModelInfo {
-    pub model: RerankerModel,
-    pub description: String,
-    pub model_code: String,
-    pub model_file: String,
-    pub additional_files: Vec<String>,
-}
-
 impl Display for RerankerModel {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let model_info = reranker_model_list()
diff --git a/src/models/sparse.rs b/src/models/sparse.rs
index 9811dc3..1e82ccf 100644
--- a/src/models/sparse.rs
+++ b/src/models/sparse.rs
@@ -1,7 +1,6 @@
 use std::fmt::Display;

-use crate::{common::SparseEmbedding, ModelInfo};
-use ndarray::{ArrayViewD, Axis, CowArray, Dim};
+use crate::ModelInfo;

 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum SparseModel {
@@ -20,48 +19,6 @@ pub fn models_list() -> Vec<ModelInfo<SparseModel>> {
     }]
 }

-impl SparseModel {
-    pub fn post_process(
-        &self,
-        model_output: &ArrayViewD<f32>,
-        attention_mask: &CowArray<i64, Dim<[usize; 2]>>,
-    ) -> Vec<SparseEmbedding> {
-        match self {
-            SparseModel::SPLADEPPV1 => {
-                // Apply ReLU and logarithm transformation
-                let relu_log = model_output.mapv(|x| (1.0 + x.max(0.0)).ln());
-
-                // Convert to f32 and expand the dimensions
-                let attention_mask = attention_mask.mapv(|x| x as f32).insert_axis(Axis(2));
-
-                // Weight the transformed values by the attention mask
-                let weighted_log = relu_log * attention_mask;
-
-                // Get the max scores
-                let scores = weighted_log.fold_axis(Axis(1), f32::NEG_INFINITY, |r, &v| r.max(v));
-
-                scores
-                    .rows()
-                    .into_iter()
-                    .map(|row_scores| {
-                        let mut values: Vec<f32> = Vec::with_capacity(scores.len());
-                        let mut indices: Vec<usize> = Vec::with_capacity(scores.len());
-
-                        row_scores.into_iter().enumerate().for_each(|(idx, f)| {
-                            if *f > 0.0 {
-                                values.push(*f);
-                                indices.push(idx);
-                            }
-                        });
-
-                        SparseEmbedding { values, indices }
-                    })
-                    .collect()
-            }
-        }
-    }
-}
-
 impl Display for SparseModel {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let model_info = models_list()
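The `post_process` body deleted here is re-added, essentially verbatim, as a private helper on `SparseTextEmbedding` (see `src/sparse_text_embedding/impl.rs` further down). For reference, the SPLADE++ transformation it implements computes, for each vocabulary index $j$ of a row:

$$ s_j = \max_{i} \; m_i \cdot \log\!\left(1 + \mathrm{ReLU}(z_{ij})\right) $$

where $z_{ij}$ is the model output at sequence position $i$ for vocabulary token $j$ and $m_i$ is the attention mask; only the strictly positive $s_j$ are kept as the `(indices, values)` pairs of the resulting `SparseEmbedding`.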
diff --git a/src/models/text_embedding.rs b/src/models/text_embedding.rs
index 7113223..89778ae 100644
--- a/src/models/text_embedding.rs
+++ b/src/models/text_embedding.rs
@@ -1,11 +1,7 @@
-use crate::pooling::Pooling;
+use std::{collections::HashMap, fmt::Display, sync::OnceLock};

 use super::model_info::ModelInfo;
-use super::quantization::QuantizationMode;
-
-use std::{collections::HashMap, fmt::Display, sync::OnceLock};
-
 /// Lazy static list of all available models.
 static MODEL_MAP: OnceLock<HashMap<EmbeddingModel, ModelInfo<EmbeddingModel>>> = OnceLock::new();
@@ -45,6 +41,8 @@ pub enum EmbeddingModel {
     ParaphraseMLMpnetBaseV2,
     /// BAAI/bge-small-zh-v1.5
     BGESmallZHV15,
+    /// lightonai/modernbert-embed-large
+    ModernBertEmbedLarge,
     /// intfloat/multilingual-e5-small
     MultilingualE5Small,
     /// intfloat/multilingual-e5-base
@@ -214,6 +212,14 @@ fn init_models_map() -> HashMap<EmbeddingModel, ModelInfo<EmbeddingModel>> {
             model_file: String::from("onnx/model.onnx"),
             additional_files: Vec::new(),
         },
+        ModelInfo {
+            model: EmbeddingModel::ModernBertEmbedLarge,
+            dim: 1024,
+            description: String::from("Large model of ModernBert Text Embeddings"),
+            model_code: String::from("lightonai/modernbert-embed-large"),
+            model_file: String::from("onnx/model.onnx"),
+            additional_files: Vec::new(),
+        },
         ModelInfo {
             model: EmbeddingModel::MultilingualE5Small,
             dim: 384,
@@ -338,78 +344,6 @@ pub fn models_list() -> Vec<ModelInfo<EmbeddingModel>> {
     models_map().values().cloned().collect()
 }

-impl EmbeddingModel {
-    pub fn get_default_pooling_method(&self) -> Option<Pooling> {
-        match self {
-            EmbeddingModel::AllMiniLML6V2 => Some(Pooling::Mean),
-            EmbeddingModel::AllMiniLML6V2Q => Some(Pooling::Mean),
-            EmbeddingModel::AllMiniLML12V2 => Some(Pooling::Mean),
-            EmbeddingModel::AllMiniLML12V2Q => Some(Pooling::Mean),
-
-            EmbeddingModel::BGEBaseENV15 => Some(Pooling::Cls),
-            EmbeddingModel::BGEBaseENV15Q => Some(Pooling::Cls),
-            EmbeddingModel::BGELargeENV15 => Some(Pooling::Cls),
-            EmbeddingModel::BGELargeENV15Q => Some(Pooling::Cls),
-            EmbeddingModel::BGESmallENV15 => Some(Pooling::Cls),
-            EmbeddingModel::BGESmallENV15Q => Some(Pooling::Cls),
-            EmbeddingModel::BGESmallZHV15 => Some(Pooling::Cls),
-
-            EmbeddingModel::NomicEmbedTextV1 => Some(Pooling::Mean),
-            EmbeddingModel::NomicEmbedTextV15 => Some(Pooling::Mean),
-            EmbeddingModel::NomicEmbedTextV15Q => Some(Pooling::Mean),
-
-            EmbeddingModel::ParaphraseMLMiniLML12V2 => Some(Pooling::Mean),
-            EmbeddingModel::ParaphraseMLMiniLML12V2Q => Some(Pooling::Mean),
-            EmbeddingModel::ParaphraseMLMpnetBaseV2 => Some(Pooling::Mean),
-
-            EmbeddingModel::MultilingualE5Base => Some(Pooling::Mean),
-            EmbeddingModel::MultilingualE5Small => Some(Pooling::Mean),
-            EmbeddingModel::MultilingualE5Large => Some(Pooling::Mean),
-
-            EmbeddingModel::MxbaiEmbedLargeV1 => Some(Pooling::Cls),
-            EmbeddingModel::MxbaiEmbedLargeV1Q => Some(Pooling::Cls),
-
-            EmbeddingModel::GTEBaseENV15 => Some(Pooling::Cls),
-            EmbeddingModel::GTEBaseENV15Q => Some(Pooling::Cls),
-            EmbeddingModel::GTELargeENV15 => Some(Pooling::Cls),
-            EmbeddingModel::GTELargeENV15Q => Some(Pooling::Cls),
-
-            EmbeddingModel::ClipVitB32 => Some(Pooling::Mean),
-
-            EmbeddingModel::JinaEmbeddingsV2BaseCode => Some(Pooling::Mean),
-        }
-    }
-
-    /// Get the quantization mode of the model.
-    ///
-    /// Any models with a `Q` suffix in their name are quantized models.
-    ///
-    /// Currently only 6 supported models have dynamic quantization:
-    /// - Alibaba-NLP/gte-base-en-v1.5
-    /// - Alibaba-NLP/gte-large-en-v1.5
-    /// - mixedbread-ai/mxbai-embed-large-v1
-    /// - nomic-ai/nomic-embed-text-v1.5
-    /// - Xenova/all-MiniLM-L12-v2
-    /// - Xenova/all-MiniLM-L6-v2
-    ///
-    // TODO: Update this list when more models are added
-    pub fn get_quantization_mode(&self) -> QuantizationMode {
-        match self {
-            EmbeddingModel::AllMiniLML6V2Q => QuantizationMode::Dynamic,
-            EmbeddingModel::AllMiniLML12V2Q => QuantizationMode::Dynamic,
-            EmbeddingModel::BGEBaseENV15Q => QuantizationMode::Static,
-            EmbeddingModel::BGELargeENV15Q => QuantizationMode::Static,
-            EmbeddingModel::BGESmallENV15Q => QuantizationMode::Static,
-            EmbeddingModel::NomicEmbedTextV15Q => QuantizationMode::Dynamic,
-            EmbeddingModel::ParaphraseMLMiniLML12V2Q => QuantizationMode::Static,
-            EmbeddingModel::MxbaiEmbedLargeV1Q => QuantizationMode::Dynamic,
-            EmbeddingModel::GTEBaseENV15Q => QuantizationMode::Dynamic,
-            EmbeddingModel::GTELargeENV15Q => QuantizationMode::Dynamic,
-            _ => QuantizationMode::None,
-        }
-    }
-}
-
 impl Display for EmbeddingModel {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let model_info = get_model_info(self).expect("Model not found.");
diff --git a/src/output/embedding_output.rs b/src/output/embedding_output.rs
index aa732d9..ac7dcc4 100644
--- a/src/output/embedding_output.rs
+++ b/src/output/embedding_output.rs
@@ -62,7 +62,7 @@ impl SingleBatchOutput<'_, '_> {
     // If there is none pooling, default to cls so as not to break the existing implementations
     // TODO: Consider return output as is to support custom model that has built-in pooling layer:
-    //    - [] Add model with built-in pooling to the list of supported model in ``models::text_embdding::models_list``
+    //    - [] Add model with built-in pooling to the list of supported model in ``models::text_embedding::models_list``
     //    - [] Write unit test for new model
     //    - [] Update ``pooling::Pooling`` to include None type
     //    - [] Change the line below to return output as is
diff --git a/src/reranking/impl.rs b/src/reranking/impl.rs
index cc458fb..4b27a51 100644
--- a/src/reranking/impl.rs
+++ b/src/reranking/impl.rs
@@ -1,4 +1,4 @@
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use anyhow::Context;
 use anyhow::Result;
 use ort::{
@@ -7,19 +7,19 @@ use ort::{
     session::{builder::GraphOptimizationLevel, Session},
     value::Value,
 };
 use std::thread::available_parallelism;

-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use crate::common::load_tokenizer_hf_hub;
 use crate::{
     common::load_tokenizer, models::reranking::reranker_model_list, RerankerModel,
     RerankerModelInfo,
 };
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use hf_hub::{api::sync::ApiBuilder, Cache};
 use ndarray::{s, Array};
 use rayon::{iter::ParallelIterator, slice::ParallelSlice};
 use tokenizers::Tokenizer;

-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use super::RerankInitOptions;
 use super::{
     OnnxSource, RerankInitOptionsUserDefined, RerankResult, TextRerank, UserDefinedRerankingModel,
@@ -50,7 +50,7 @@ impl TextRerank {
         reranker_model_list()
     }

-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     pub fn try_new(options: RerankInitOptions) -> Result<Self> {
         use super::RerankInitOptions;
@@ -122,7 +122,7 @@ impl TextRerank {
         Ok(Self::new(tokenizer, session))
     }
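The call-site API for reranking is unchanged by this refactor. A minimal usage sketch; the variant name `BGERerankerBase` follows the `BAAI/bge-reranker-base` doc comment above and is assumed here, as are the query and documents:

```rust
use fastembed::{RerankInitOptions, RerankerModel, TextRerank};

fn main() -> anyhow::Result<()> {
    // Downloads (or reuses the cached) reranker model, then scores documents.
    let reranker = TextRerank::try_new(RerankInitOptions::new(RerankerModel::BGERerankerBase))?;
    let documents = vec!["panda is an animal", "hi", "i don't know"];
    // `true` attaches the document text to each result; results come back
    // sorted by score in descending order.
    let results = reranker.rerank("what is panda?", documents, true, None)?;
    println!("best match: {:?}", results.first());
    Ok(())
}
```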
-    /// Reranks documents using the reranker model and returns the results sorted by score in descending order.
+    /// Rerank documents using the reranker model and return the results sorted by score in descending order.
     pub fn rerank<S: AsRef<str> + Send + Sync>(
         &self,
         query: S,
         documents: Vec<S>,
         return_documents: bool,
         batch_size: Option<usize>,
     ) -> Result<Vec<RerankResult>> {
@@ -151,16 +151,16 @@ impl TextRerank {
             let mut ids_array = Vec::with_capacity(max_size);
             let mut mask_array = Vec::with_capacity(max_size);
-            let mut typeids_array = Vec::with_capacity(max_size);
+            let mut type_ids_array = Vec::with_capacity(max_size);

             encodings.iter().for_each(|encoding| {
                 let ids = encoding.get_ids();
                 let mask = encoding.get_attention_mask();
-                let typeids = encoding.get_type_ids();
+                let type_ids = encoding.get_type_ids();

                 ids_array.extend(ids.iter().map(|x| *x as i64));
                 mask_array.extend(mask.iter().map(|x| *x as i64));
-                typeids_array.extend(typeids.iter().map(|x| *x as i64));
+                type_ids_array.extend(type_ids.iter().map(|x| *x as i64));
             });

             let inputs_ids_array =
                 Array::from_shape_vec((batch_size, encoding_length), ids_array)?;
@@ -170,7 +170,7 @@ impl TextRerank {
                 Array::from_shape_vec((batch_size, encoding_length), mask_array)?;

             let token_type_ids_array =
-                Array::from_shape_vec((batch_size, encoding_length), typeids_array)?;
+                Array::from_shape_vec((batch_size, encoding_length), type_ids_array)?;

             let mut session_inputs = ort::inputs![
                 "input_ids" => Value::from_array(inputs_ids_array)?,
diff --git a/src/sparse_text_embedding/impl.rs b/src/sparse_text_embedding/impl.rs
index 2bd4256..13ccc28 100644
--- a/src/sparse_text_embedding/impl.rs
+++ b/src/sparse_text_embedding/impl.rs
@@ -1,29 +1,29 @@
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use crate::common::load_tokenizer_hf_hub;
 use crate::{
     models::sparse::{models_list, SparseModel},
     ModelInfo, SparseEmbedding,
 };
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use anyhow::Context;
 use anyhow::Result;
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use hf_hub::{
     api::sync::{ApiBuilder, ApiRepo},
     Cache,
 };
-use ndarray::{Array, CowArray};
+use ndarray::{Array, ArrayViewD, Axis, CowArray, Dim};
 use ort::{session::Session, value::Value};
-#[cfg_attr(not(feature = "online"), allow(unused_imports))]
+#[cfg_attr(not(feature = "hf-hub"), allow(unused_imports))]
 use rayon::{iter::ParallelIterator, slice::ParallelSlice};
-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use std::path::PathBuf;
 use tokenizers::Tokenizer;

-#[cfg_attr(not(feature = "online"), allow(unused_imports))]
+#[cfg_attr(not(feature = "hf-hub"), allow(unused_imports))]
 use std::thread::available_parallelism;

-#[cfg(feature = "online")]
+#[cfg(feature = "hf-hub")]
 use super::SparseInitOptions;
 use super::{SparseTextEmbedding, DEFAULT_BATCH_SIZE};
@@ -33,7 +33,7 @@ impl SparseTextEmbedding {
     /// Uses the highest level of Graph optimization
     ///
     /// Uses the total number of CPUs available as the number of intra-threads
-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     pub fn try_new(options: SparseInitOptions) -> Result<SparseTextEmbedding> {
         use super::SparseInitOptions;
         use ort::{session::builder::GraphOptimizationLevel, session::Session};
@@ -70,7 +70,7 @@ impl SparseTextEmbedding {
     }

     /// Private method to return an instance
-    #[cfg_attr(not(feature = "online"), allow(dead_code))]
+    #[cfg_attr(not(feature = "hf-hub"), allow(dead_code))]
     fn new(tokenizer: Tokenizer, session: Session, model: SparseModel) -> Self {
         let need_token_type_ids = session
             .inputs
@@ -84,7 +84,7 @@ impl SparseTextEmbedding {
         }
     }
     /// Return the SparseTextEmbedding model's directory from cache or remote retrieval
-    #[cfg(feature = "online")]
+    #[cfg(feature = "hf-hub")]
     fn retrieve_model(
         model: SparseModel,
         cache_dir: PathBuf,
@@ -138,19 +138,19 @@ impl SparseTextEmbedding {
             // Preallocate arrays with the maximum size
             let mut ids_array = Vec::with_capacity(max_size);
             let mut mask_array = Vec::with_capacity(max_size);
-            let mut typeids_array = Vec::with_capacity(max_size);
+            let mut type_ids_array = Vec::with_capacity(max_size);

             // Not using par_iter because the closure needs to be FnMut
             encodings.iter().for_each(|encoding| {
                 let ids = encoding.get_ids();
                 let mask = encoding.get_attention_mask();
-                let typeids = encoding.get_type_ids();
+                let type_ids = encoding.get_type_ids();

                 // Extend the preallocated arrays with the current encoding
                 // Requires the closure to be FnMut
                 ids_array.extend(ids.iter().map(|x| *x as i64));
                 mask_array.extend(mask.iter().map(|x| *x as i64));
-                typeids_array.extend(typeids.iter().map(|x| *x as i64));
+                type_ids_array.extend(type_ids.iter().map(|x| *x as i64));
             });

             // Create CowArrays from vectors
@@ -161,7 +161,7 @@ impl SparseTextEmbedding {
             let attention_mask_array = CowArray::from(&owned_attention_mask);

             let token_type_ids_array =
-                Array::from_shape_vec((batch_size, encoding_length), typeids_array)?;
+                Array::from_shape_vec((batch_size, encoding_length), type_ids_array)?;

             let mut session_inputs = ort::inputs![
                 "input_ids" => Value::from_array(inputs_ids_array)?,
@@ -186,7 +186,11 @@ impl SparseTextEmbedding {
                 let output_data = outputs[last_hidden_state_key].try_extract_tensor::<f32>()?;

-                let embeddings = self.model.post_process(&output_data, &attention_mask_array);
+                let embeddings = SparseTextEmbedding::post_process(
+                    &self.model,
+                    &output_data,
+                    &attention_mask_array,
+                );

                 Ok(embeddings)
             })
@@ -197,4 +201,44 @@ impl SparseTextEmbedding {

         Ok(output)
     }
+
+    fn post_process(
+        model_name: &SparseModel,
+        model_output: &ArrayViewD<f32>,
+        attention_mask: &CowArray<i64, Dim<[usize; 2]>>,
+    ) -> Vec<SparseEmbedding> {
+        match model_name {
+            SparseModel::SPLADEPPV1 => {
+                // Apply ReLU and logarithm transformation
+                let relu_log = model_output.mapv(|x| (1.0 + x.max(0.0)).ln());
+
+                // Convert to f32 and expand the dimensions
+                let attention_mask = attention_mask.mapv(|x| x as f32).insert_axis(Axis(2));
+
+                // Weight the transformed values by the attention mask
+                let weighted_log = relu_log * attention_mask;
+
+                // Get the max scores
+                let scores = weighted_log.fold_axis(Axis(1), f32::NEG_INFINITY, |r, &v| r.max(v));
+
+                scores
+                    .rows()
+                    .into_iter()
+                    .map(|row_scores| {
+                        let mut values: Vec<f32> = Vec::with_capacity(scores.len());
+                        let mut indices: Vec<usize> = Vec::with_capacity(scores.len());
+
+                        row_scores.into_iter().enumerate().for_each(|(idx, f)| {
+                            if *f > 0.0 {
+                                values.push(*f);
+                                indices.push(idx);
+                            }
+                        });
+
+                        SparseEmbedding { values, indices }
+                    })
+                    .collect()
+            }
+        }
+    }
 }
diff --git a/src/text_embedding/impl.rs b/src/text_embedding/impl.rs
index b52cf5e..23222c7 100644
--- a/src/text_embedding/impl.rs
+++ b/src/text_embedding/impl.rs
@@ -1,6 +1,6 @@
 //! The definition of the main struct for text embeddings - [`TextEmbedding`].

-#[cfg(feature = "online")] +#[cfg(feature = "hf-hub")] use crate::common::load_tokenizer_hf_hub; use crate::{ common::load_tokenizer, @@ -8,10 +8,10 @@ use crate::{ pooling::Pooling, Embedding, EmbeddingModel, EmbeddingOutput, ModelInfo, QuantizationMode, SingleBatchOutput, }; -#[cfg(feature = "online")] +#[cfg(feature = "hf-hub")] use anyhow::Context; use anyhow::Result; -#[cfg(feature = "online")] +#[cfg(feature = "hf-hub")] use hf_hub::{ api::sync::{ApiBuilder, ApiRepo}, Cache, @@ -25,12 +25,12 @@ use rayon::{ iter::{FromParallelIterator, ParallelIterator}, slice::ParallelSlice, }; -#[cfg(feature = "online")] +#[cfg(feature = "hf-hub")] use std::path::PathBuf; use std::thread::available_parallelism; use tokenizers::Tokenizer; -#[cfg(feature = "online")] +#[cfg(feature = "hf-hub")] use super::InitOptions; use super::{ output, InitOptionsUserDefined, TextEmbedding, UserDefinedEmbeddingModel, DEFAULT_BATCH_SIZE, @@ -42,7 +42,7 @@ impl TextEmbedding { /// Uses the highest level of Graph optimization /// /// Uses the total number of CPUs available as the number of intra-threads - #[cfg(feature = "online")] + #[cfg(feature = "hf-hub")] pub fn try_new(options: InitOptions) -> Result { let InitOptions { model_name, @@ -75,7 +75,7 @@ impl TextEmbedding { } // prioritise loading pooling config if available, if not (thanks qdrant!), look for it in hardcoded - let post_processing = model_name.get_default_pooling_method(); + let post_processing = TextEmbedding::get_default_pooling_method(&model_name); let session = Session::builder()? .with_execution_providers(execution_providers)? @@ -88,7 +88,7 @@ impl TextEmbedding { tokenizer, session, post_processing, - model_name.get_quantization_mode(), + TextEmbedding::get_quantization_mode(&model_name), )) } @@ -142,7 +142,7 @@ impl TextEmbedding { } } /// Return the TextEmbedding model's directory from cache or remote retrieval - #[cfg(feature = "online")] + #[cfg(feature = "hf-hub")] fn retrieve_model( model: EmbeddingModel, cache_dir: PathBuf, @@ -157,6 +157,78 @@ impl TextEmbedding { Ok(repo) } + pub fn get_default_pooling_method(model_name: &EmbeddingModel) -> Option { + match model_name { + EmbeddingModel::AllMiniLML6V2 => Some(Pooling::Mean), + EmbeddingModel::AllMiniLML6V2Q => Some(Pooling::Mean), + EmbeddingModel::AllMiniLML12V2 => Some(Pooling::Mean), + EmbeddingModel::AllMiniLML12V2Q => Some(Pooling::Mean), + + EmbeddingModel::BGEBaseENV15 => Some(Pooling::Cls), + EmbeddingModel::BGEBaseENV15Q => Some(Pooling::Cls), + EmbeddingModel::BGELargeENV15 => Some(Pooling::Cls), + EmbeddingModel::BGELargeENV15Q => Some(Pooling::Cls), + EmbeddingModel::BGESmallENV15 => Some(Pooling::Cls), + EmbeddingModel::BGESmallENV15Q => Some(Pooling::Cls), + EmbeddingModel::BGESmallZHV15 => Some(Pooling::Cls), + + EmbeddingModel::NomicEmbedTextV1 => Some(Pooling::Mean), + EmbeddingModel::NomicEmbedTextV15 => Some(Pooling::Mean), + EmbeddingModel::NomicEmbedTextV15Q => Some(Pooling::Mean), + + EmbeddingModel::ParaphraseMLMiniLML12V2 => Some(Pooling::Mean), + EmbeddingModel::ParaphraseMLMiniLML12V2Q => Some(Pooling::Mean), + EmbeddingModel::ParaphraseMLMpnetBaseV2 => Some(Pooling::Mean), + + EmbeddingModel::ModernBertEmbedLarge => Some(Pooling::Mean), + + EmbeddingModel::MultilingualE5Base => Some(Pooling::Mean), + EmbeddingModel::MultilingualE5Small => Some(Pooling::Mean), + EmbeddingModel::MultilingualE5Large => Some(Pooling::Mean), + + EmbeddingModel::MxbaiEmbedLargeV1 => Some(Pooling::Cls), + EmbeddingModel::MxbaiEmbedLargeV1Q => Some(Pooling::Cls), + + 
EmbeddingModel::GTEBaseENV15 => Some(Pooling::Cls), + EmbeddingModel::GTEBaseENV15Q => Some(Pooling::Cls), + EmbeddingModel::GTELargeENV15 => Some(Pooling::Cls), + EmbeddingModel::GTELargeENV15Q => Some(Pooling::Cls), + + EmbeddingModel::ClipVitB32 => Some(Pooling::Mean), + + EmbeddingModel::JinaEmbeddingsV2BaseCode => Some(Pooling::Mean), + } + } + + /// Get the quantization mode of the model. + /// + /// Any models with a `Q` suffix in their name are quantized models. + /// + /// Currently only 6 supported models have dynamic quantization: + /// - Alibaba-NLP/gte-base-en-v1.5 + /// - Alibaba-NLP/gte-large-en-v1.5 + /// - mixedbread-ai/mxbai-embed-large-v1 + /// - nomic-ai/nomic-embed-text-v1.5 + /// - Xenova/all-MiniLM-L12-v2 + /// - Xenova/all-MiniLM-L6-v2 + /// + // TODO: Update this list when more models are added + pub fn get_quantization_mode(model_name: &EmbeddingModel) -> QuantizationMode { + match model_name { + EmbeddingModel::AllMiniLML6V2Q => QuantizationMode::Dynamic, + EmbeddingModel::AllMiniLML12V2Q => QuantizationMode::Dynamic, + EmbeddingModel::BGEBaseENV15Q => QuantizationMode::Static, + EmbeddingModel::BGELargeENV15Q => QuantizationMode::Static, + EmbeddingModel::BGESmallENV15Q => QuantizationMode::Static, + EmbeddingModel::NomicEmbedTextV15Q => QuantizationMode::Dynamic, + EmbeddingModel::ParaphraseMLMiniLML12V2Q => QuantizationMode::Static, + EmbeddingModel::MxbaiEmbedLargeV1Q => QuantizationMode::Dynamic, + EmbeddingModel::GTEBaseENV15Q => QuantizationMode::Dynamic, + EmbeddingModel::GTELargeENV15Q => QuantizationMode::Dynamic, + _ => QuantizationMode::None, + } + } + /// Retrieve a list of supported models pub fn list_supported_models() -> Vec> { models_list() @@ -190,7 +262,7 @@ impl TextEmbedding { /// If you want to use the raw session outputs, use [`EmbeddingOutput::into_raw`] /// on the output of this method. /// - /// If you want to choose a different export key or customise the way the batch + /// If you want to choose a different export key or customize the way the batch /// arrays are aggregated, you can define your own array transformer /// and use it on [`EmbeddingOutput::export_with_transformer`] to extract the /// embeddings with your custom output type. 
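Because `get_default_pooling_method` and `get_quantization_mode` moved from `EmbeddingModel` onto `TextEmbedding` as associated functions, call sites change as sketched below; the expected values mirror the match arms added above:

```rust
use fastembed::{EmbeddingModel, Pooling, QuantizationMode, TextEmbedding};

fn main() {
    let model = EmbeddingModel::AllMiniLML6V2Q;
    // Before this change: model.get_default_pooling_method()
    assert!(matches!(
        TextEmbedding::get_default_pooling_method(&model),
        Some(Pooling::Mean)
    ));
    // Before this change: model.get_quantization_mode()
    assert!(matches!(
        TextEmbedding::get_quantization_mode(&model),
        QuantizationMode::Dynamic
    ));
}
```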
@@ -242,19 +314,19 @@ impl TextEmbedding { // Preallocate arrays with the maximum size let mut ids_array = Vec::with_capacity(max_size); let mut mask_array = Vec::with_capacity(max_size); - let mut typeids_array = Vec::with_capacity(max_size); + let mut type_ids_array = Vec::with_capacity(max_size); // Not using par_iter because the closure needs to be FnMut encodings.iter().for_each(|encoding| { let ids = encoding.get_ids(); let mask = encoding.get_attention_mask(); - let typeids = encoding.get_type_ids(); + let type_ids = encoding.get_type_ids(); // Extend the preallocated arrays with the current encoding // Requires the closure to be FnMut ids_array.extend(ids.iter().map(|x| *x as i64)); mask_array.extend(mask.iter().map(|x| *x as i64)); - typeids_array.extend(typeids.iter().map(|x| *x as i64)); + type_ids_array.extend(type_ids.iter().map(|x| *x as i64)); }); // Create CowArrays from vectors @@ -264,7 +336,7 @@ impl TextEmbedding { Array::from_shape_vec((batch_size, encoding_length), mask_array)?; let token_type_ids_array = - Array::from_shape_vec((batch_size, encoding_length), typeids_array)?; + Array::from_shape_vec((batch_size, encoding_length), type_ids_array)?; let mut session_inputs = ort::inputs![ "input_ids" => Value::from_array(inputs_ids_array)?, @@ -313,7 +385,7 @@ impl TextEmbedding { let batches = self.transform(texts, batch_size)?; batches.export_with_transformer(output::transformer_with_precedence( - output::OUTPUT_TYPE_PRECENDENCE, + output::OUTPUT_TYPE_PRECEDENCE, self.pooling.clone(), )) } diff --git a/src/text_embedding/output.rs b/src/text_embedding/output.rs index f7b5383..e41d962 100644 --- a/src/text_embedding/output.rs +++ b/src/text_embedding/output.rs @@ -10,7 +10,7 @@ use crate::{ use super::TextEmbedding; /// The default output precedence for the TextEmbedding model. -pub const OUTPUT_TYPE_PRECENDENCE: &[OutputKey] = &[ +pub const OUTPUT_TYPE_PRECEDENCE: &[OutputKey] = &[ OutputKey::OnlyOne, OutputKey::ByName("last_hidden_state"), OutputKey::ByName("sentence_embedding"), diff --git a/tests/assets/sample_text.txt b/tests/assets/sample_text.txt new file mode 100644 index 0000000..00db92c --- /dev/null +++ b/tests/assets/sample_text.txt @@ -0,0 +1 @@ +animals environment general health health general weight philosophy ethics Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto’s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: “it takes four football fields of land (about 1.6 hectares) to feed each Canadian” and “one apple tree produces enough fruit to make 320 pies.” Think about it — a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. 
Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions — by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says “We know that the single biggest driver of deforestation in the Amazon is cattle.” This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. [3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China’s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, ‘How Meat-centred Eating Patterns Affect Food Security and the Environment’, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers ‘helping to destroy Amazon rainforest’, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. 
[5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river ‘drying up’, BBC News, 29th July 2004 \ No newline at end of file diff --git a/tests/embeddings.rs b/tests/embeddings.rs index 1569958..79e1248 100644 --- a/tests/embeddings.rs +++ b/tests/embeddings.rs @@ -1,4 +1,4 @@ -#![cfg(feature = "online")] +#![cfg(feature = "hf-hub")] use std::fs; use std::path::Path; @@ -7,11 +7,11 @@ use hf_hub::Repo; use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use fastembed::{ - read_file_to_bytes, Embedding, EmbeddingModel, ImageEmbedding, ImageInitOptions, InitOptions, - InitOptionsUserDefined, OnnxSource, Pooling, QuantizationMode, RerankInitOptions, - RerankInitOptionsUserDefined, RerankerModel, SparseInitOptions, SparseTextEmbedding, - TextEmbedding, TextRerank, TokenizerFiles, UserDefinedEmbeddingModel, - UserDefinedRerankingModel, DEFAULT_CACHE_DIR, + read_file_to_bytes, Embedding, EmbeddingModel, ImageEmbedding, ImageEmbeddingModel, + ImageInitOptions, InitOptions, InitOptionsUserDefined, ModelInfo, OnnxSource, Pooling, + QuantizationMode, RerankInitOptions, RerankInitOptionsUserDefined, RerankerModel, + RerankerModelInfo, SparseInitOptions, SparseTextEmbedding, TextEmbedding, TextRerank, + TokenizerFiles, UserDefinedEmbeddingModel, UserDefinedRerankingModel, DEFAULT_CACHE_DIR, }; /// A small epsilon value for floating point comparisons. @@ -50,6 +50,7 @@ fn verify_embeddings(model: &EmbeddingModel, embeddings: &[Embedding]) -> Result EmbeddingModel::GTEBaseENV15Q => [-1.7032102, -1.7076654, -1.729326, -1.5317788], EmbeddingModel::GTELargeENV15 => [-1.6457459, -1.6582386, -1.6809471, -1.6070237], EmbeddingModel::GTELargeENV15Q => [-1.6044945, -1.6469251, -1.6828246, -1.6265479], + EmbeddingModel::ModernBertEmbedLarge => [ 0.24799639, 0.32174295, 0.17255782, 0.32919246], EmbeddingModel::MultilingualE5Base => [-0.057211064, -0.14287914, -0.071678676, -0.17549144], EmbeddingModel::MultilingualE5Large => [-0.7473163, -0.76040405, -0.7537941, -0.72920954], EmbeddingModel::MultilingualE5Small => [-0.2640718, -0.13929011, -0.08091972, -0.12388548], @@ -119,7 +120,7 @@ macro_rules! create_embeddings_test { let embeddings = model.embed(documents.clone(), batch_size); if matches!( - (batch_size, supported_model.model.get_quantization_mode()), + (batch_size, TextEmbedding::get_quantization_mode(&supported_model.model)), (Some(n), QuantizationMode::Dynamic) if n < documents.len() ) { // For Dynamic quantization, the batch size must be greater than or equal to the number of documents @@ -154,10 +155,12 @@ macro_rules! 
create_embeddings_test { }; } + create_embeddings_test!( name: test_batch_size_default, batch_size: None, ); + create_embeddings_test!( name: test_batch_size_less_than_document_count, batch_size: Some(3), @@ -284,14 +287,11 @@ fn test_user_defined_embedding_model() { #[test] fn test_rerank() { - TextRerank::list_supported_models() - .par_iter() - .for_each(|supported_model| { - - println!("supported_model: {:?}", supported_model); + let test_one_model = |supported_model: &RerankerModelInfo| { + println!("supported_model: {:?}", supported_model); - let result = TextRerank::try_new(RerankInitOptions::new(supported_model.model.clone())) - .unwrap(); + let result = + TextRerank::try_new(RerankInitOptions::new(supported_model.model.clone())).unwrap(); let documents = vec![ "hi", @@ -305,24 +305,35 @@ fn test_rerank() { .rerank("what is panda?", documents.clone(), true, None) .unwrap(); - assert_eq!(results.len(), documents.len(), "rerank model {:?} failed", supported_model); + assert_eq!( + results.len(), + documents.len(), + "rerank model {:?} failed", + supported_model + ); let option_a = "panda is an animal"; let option_b = "The giant panda, sometimes called a panda bear or simply panda, is a bear species endemic to China."; assert!( - results[0].document.as_ref().unwrap() == option_a || - results[0].document.as_ref().unwrap() == option_b + results[0].document.as_ref().unwrap() == option_a + || results[0].document.as_ref().unwrap() == option_b ); assert!( - results[1].document.as_ref().unwrap() == option_a || - results[1].document.as_ref().unwrap() == option_b + results[1].document.as_ref().unwrap() == option_a + || results[1].document.as_ref().unwrap() == option_b + ); + assert_ne!( + results[0].document, results[1].document, + "The top two results should be different" ); - assert_ne!(results[0].document, results[1].document, "The top two results should be different"); // Clear the model cache to avoid running out of space on GitHub Actions. clean_cache(supported_model.model_code.clone()) - }); + }; + TextRerank::list_supported_models() + .par_iter() + .for_each(test_one_model); } #[ignore] @@ -467,26 +478,20 @@ fn test_user_defined_reranking_model() { #[test] fn test_image_embedding_model() { - ImageEmbedding::list_supported_models() - .par_iter() - .for_each(|supported_model| { - let model: ImageEmbedding = - ImageEmbedding::try_new(ImageInitOptions::new(supported_model.model.clone())) - .unwrap(); - - let images = vec!["tests/assets/image_0.png", "tests/assets/image_1.png"]; + let test_one_model = |supported_model: &ModelInfo| { + let model: ImageEmbedding = + ImageEmbedding::try_new(ImageInitOptions::new(supported_model.model.clone())).unwrap(); - // Generate embeddings with the default batch size, 256 - let embeddings = model.embed(images.clone(), None).unwrap(); + let images = vec!["tests/assets/image_0.png", "tests/assets/image_1.png"]; - assert_eq!(embeddings.len(), images.len()); - for embedding in embeddings { - assert_eq!(embedding.len(), supported_model.dim); - } + // Generate embeddings with the default batch size, 256 + let embeddings = model.embed(images.clone(), None).unwrap(); - // Clear the model cache to avoid running out of space on GitHub Actions. 
- clean_cache(supported_model.model_code.clone()) - }); + assert_eq!(embeddings.len(), images.len()); + }; + ImageEmbedding::list_supported_models() + .par_iter() + .for_each(test_one_model); } #[test] @@ -551,17 +556,19 @@ fn clean_cache(model_code: String) { let cache_dir = format!("{}/{}", DEFAULT_CACHE_DIR, repo.folder_name()); fs::remove_dir_all(cache_dir).ok(); } + // This is item "test-environment-aeghhgwpe-pro02a" of the [Aguana corpus](http://argumentation.bplaced.net/arguana/data) fn get_sample_text() -> String { - let t = "animals environment general health health general weight philosophy ethics Being vegetarian helps the environment Becoming a vegetarian is an environmentally friendly thing to do. Modern farming is one of the main sources of pollution in our rivers. Beef farming is one of the main causes of deforestation, and as long as people continue to buy fast food in their billions, there will be a financial incentive to continue cutting down trees to make room for cattle. Because of our desire to eat fish, our rivers and seas are being emptied of fish and many species are facing extinction. Energy resources are used up much more greedily by meat farming than my farming cereals, pulses etc. Eating meat and fish not only causes cruelty to animals, it causes serious harm to the environment and to biodiversity. For example consider Meat production related pollution and deforestation At Toronto\u{2019}s 1992 Royal Agricultural Winter Fair, Agriculture Canada displayed two contrasting statistics: \u{201c}it takes four football fields of land (about 1.6 hectares) to feed each Canadian\u{201d} and \u{201c}one apple tree produces enough fruit to make 320 pies.\u{201d} Think about it \u{2014} a couple of apple trees and a few rows of wheat on a mere fraction of a hectare could produce enough food for one person! [1] The 2006 U.N. Food and Agriculture Organization (FAO) report concluded that worldwide livestock farming generates 18% of the planet's greenhouse gas emissions \u{2014} by comparison, all the world's cars, trains, planes and boats account for a combined 13% of greenhouse gas emissions. [2] As a result of the above point producing meat damages the environment. The demand for meat drives deforestation. Daniel Cesar Avelino of Brazil's Federal Public Prosecution Office says \u{201c}We know that the single biggest driver of deforestation in the Amazon is cattle.\u{201d} This clearing of tropical rainforests such as the Amazon for agriculture is estimated to produce 17% of the world's greenhouse gas emissions. [3] Not only this but the production of meat takes a lot more energy than it ultimately gives us chicken meat production consumes energy in a 4:1 ratio to protein output; beef cattle production requires an energy input to protein output ratio of 54:1. The same is true with water use due to the same phenomenon of meat being inefficient to produce in terms of the amount of grain needed to produce the same weight of meat, production requires a lot of water. Water is another scarce resource that we will soon not have enough of in various areas of the globe. Grain-fed beef production takes 100,000 liters of water for every kilogram of food. Raising broiler chickens takes 3,500 liters of water to make a kilogram of meat. In comparison, soybean production uses 2,000 liters for kilogram of food produced; rice, 1,912; wheat, 900; and potatoes, 500 liters. [4] This is while there are areas of the globe that have severe water shortages. 
With farming using up to 70 times more water than is used for domestic purposes: cooking and washing. A third of the population of the world is already suffering from a shortage of water. [5] Groundwater levels are falling all over the world and rivers are beginning to dry up. Already some of the biggest rivers such as China\u{2019}s Yellow river do not reach the sea. [6] With a rising population becoming vegetarian is the only responsible way to eat. [1] Stephen Leckie, \u{2018}How Meat-centred Eating Patterns Affect Food Security and the Environment\u{2019}, International development research center [2] Bryan Walsh, Meat: Making Global Warming Worse, Time magazine, 10 September 2008 . [3] David Adam, Supermarket suppliers \u{2018}helping to destroy Amazon rainforest\u{2019}, The Guardian, 21st June 2009. [4] Roger Segelken, U.S. could feed 800 million people with grain that livestock eat, Cornell Science News, 7th August 1997. [5] Fiona Harvey, Water scarcity affects one in three, FT.com, 21st August 2003 [6] Rupert Wingfield-Hayes, Yellow river \u{2018}drying up\u{2019}, BBC News, 29th July 2004"; + let t = include_str!("assets/sample_text.txt"); t.to_string() } + #[test] fn test_batch_size_does_not_change_output() { let model = TextEmbedding::try_new( InitOptions::new(EmbeddingModel::AllMiniLML6V2).with_max_length(384), ) - .expect("Create model succesfully"); + .expect("Create model successfully"); let sentences = vec![ "Books are no more threatened by Kindle than stairs by elevators.", @@ -593,7 +600,8 @@ fn test_bgesmallen1point5_match_python_counterpart() { let model = TextEmbedding::try_new( InitOptions::new(EmbeddingModel::BGESmallENV15).with_max_length(384), ) - .expect("Create model succesfully"); + .expect("Create model successfully"); + let text = get_sample_text(); // baseline is generated in python using Xenova/bge-small-en-v1.5.onnx @@ -631,7 +639,7 @@ fn test_allminilml6v2_match_python_counterpart() { let model = TextEmbedding::try_new( InitOptions::new(EmbeddingModel::AllMiniLML6V2).with_max_length(384), ) - .expect("Create model succesfully"); + .expect("Create model successfully"); let text = get_sample_text(); @@ -664,3 +672,41 @@ fn test_allminilml6v2_match_python_counterpart() { assert!((expected - actual).abs() < tolerance); } } + +#[test] +fn test_modernbert_embeddings() { + let supported_model = TextEmbedding::list_supported_models() + .into_iter() + .find(|model| matches!(model.model, EmbeddingModel::ModernBertEmbedLarge)) + .expect("ModernBERT model not found in supported models"); + + let model: TextEmbedding = + TextEmbedding::try_new(InitOptions::new(supported_model.model.clone())).unwrap(); + + let documents = vec![ + "Hello, World!", + "This is an example passage.", + "fastembed-rs is licensed under Apache-2.0", + "Some other short text here blah blah blah", + ]; + + let embeddings = model.embed(documents.clone(), None).unwrap(); + assert_eq!(embeddings.len(), documents.len()); + + for embedding in &embeddings { + assert_eq!(embedding.len(), supported_model.dim); + } + + match verify_embeddings(&supported_model.model, &embeddings) { + Ok(_) => {} + Err(mismatched_indices) => { + panic!( + "Mismatched embeddings for ModernBERT: {sentences:?}", + sentences = &mismatched_indices + .iter() + .map(|&i| documents[i]) + .collect::>() + ); + } + } +} diff --git a/tests/optimum_cli_export.rs b/tests/optimum_cli_export.rs index 23bef6d..cb034d7 100644 --- a/tests/optimum_cli_export.rs +++ b/tests/optimum_cli_export.rs @@ -1,4 +1,4 @@ -#![cfg(feature = "online")] 
+#![cfg(feature = "hf-hub")] #![cfg(feature = "optimum-cli")] //! Test the use of the ``optimum-cli`` to pull models from the Hugging Face Hub, //! and generate embeddings successfully with the pulled model.