lance_index/vector.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors
//! Vector Index
//!
use std::{collections::HashMap, sync::Arc};
use arrow_array::{ArrayRef, RecordBatch, UInt32Array};
use arrow_schema::Field;
use async_trait::async_trait;
use ivf::storage::IvfModel;
use lance_core::{Result, ROW_ID_FIELD};
use lance_io::object_store::ObjectStore;
use lance_io::traits::Reader;
use lance_linalg::distance::DistanceType;
use lazy_static::lazy_static;
use object_store::path::Path;
use quantizer::{QuantizationType, Quantizer};
use v3::subindex::SubIndexType;
pub mod bq;
pub mod flat;
pub mod graph;
pub mod hnsw;
pub mod ivf;
pub mod kmeans;
pub mod pq;
pub mod quantizer;
pub mod residual;
pub mod sq;
pub mod storage;
pub mod transform;
pub mod utils;
pub mod v3;
use super::pb;
use crate::{prefilter::PreFilter, Index};
pub use residual::RESIDUAL_COLUMN;
// TODO: Make these crate private once the migration from lance to lance-index is done.
pub const DIST_COL: &str = "_distance";
pub const DISTANCE_TYPE_KEY: &str = "distance_type";
pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
pub const PART_ID_COLUMN: &str = "__ivf_part_id";
pub const PQ_CODE_COLUMN: &str = "__pq_code";
pub const SQ_CODE_COLUMN: &str = "__sq_code";
lazy_static! {
pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef =
arrow_schema::SchemaRef::new(arrow_schema::Schema::new(vec![
Field::new(DIST_COL, arrow_schema::DataType::Float32, false),
ROW_ID_FIELD.clone(),
]));
}
/// Query parameters for the vector indices
#[derive(Debug, Clone)]
pub struct Query {
/// The column to be searched.
pub column: String,
/// The vector to be searched.
pub key: ArrayRef,
/// Top k results to return.
pub k: usize,
/// The lower bound (inclusive) of the distance to be searched.
pub lower_bound: Option<f32>,
/// The upper bound (exclusive) of the distance to be searched.
pub upper_bound: Option<f32>,
/// The number of probes to load and search.
pub nprobes: usize,
/// The number of candidates to reserve while searching.
/// this is an optional parameter for HNSW related index types.
pub ef: Option<usize>,
/// If presented, apply a refine step.
/// TODO: should we support fraction / float number here?
pub refine_factor: Option<u32>,
/// Distance metric type
pub metric_type: DistanceType,
/// Whether to use an ANN index if available
pub use_index: bool,
}
impl From<pb::VectorMetricType> for DistanceType {
fn from(proto: pb::VectorMetricType) -> Self {
match proto {
pb::VectorMetricType::L2 => Self::L2,
pb::VectorMetricType::Cosine => Self::Cosine,
pb::VectorMetricType::Dot => Self::Dot,
pb::VectorMetricType::Hamming => Self::Hamming,
}
}
}
impl From<DistanceType> for pb::VectorMetricType {
fn from(mt: DistanceType) -> Self {
match mt {
DistanceType::L2 => Self::L2,
DistanceType::Cosine => Self::Cosine,
DistanceType::Dot => Self::Dot,
DistanceType::Hamming => Self::Hamming,
}
}
}
/// Vector Index for (Approximate) Nearest Neighbor (ANN) Search.
/// It's always the IVF index, any other index types without partitioning will be treated as IVF with one partition.
#[async_trait]
#[allow(clippy::redundant_pub_crate)]
pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
/// Search the vector for nearest neighbors.
///
/// It returns a [RecordBatch] with Schema of:
///
/// ```
/// use arrow_schema::{Schema, Field, DataType};
///
/// Schema::new(vec![
/// Field::new("_rowid", DataType::UInt64, true),
/// Field::new("_distance", DataType::Float32, false),
/// ]);
/// ```
///
/// The `pre_filter` argument is used to filter out row ids that we know are
/// not relevant to the query. For example, it removes deleted rows.
///
/// *WARNINGS*:
/// - Only supports `f32` now. Will add f64/f16 later.
async fn search(&self, query: &Query, pre_filter: Arc<dyn PreFilter>) -> Result<RecordBatch>;
fn find_partitions(&self, query: &Query) -> Result<UInt32Array>;
async fn search_in_partition(
&self,
partition_id: usize,
query: &Query,
pre_filter: Arc<dyn PreFilter>,
) -> Result<RecordBatch>;
/// If the index is loadable by IVF, so it can be a sub-index that
/// is loaded on demand by IVF.
fn is_loadable(&self) -> bool;
/// Use residual vector to search.
fn use_residual(&self) -> bool;
/// If the index can be remapped return Ok. Else return an error
/// explaining why not
fn check_can_remap(&self) -> Result<()>;
// async fn append(&self, batches: Vec<RecordBatch>) -> Result<()>;
// async fn merge(&self, indices: Vec<Arc<dyn VectorIndex>>) -> Result<()>;
/// Load the index from the reader on-demand.
async fn load(
&self,
reader: Arc<dyn Reader>,
offset: usize,
length: usize,
) -> Result<Box<dyn VectorIndex>>;
/// Load the partition from the reader on-demand.
async fn load_partition(
&self,
reader: Arc<dyn Reader>,
offset: usize,
length: usize,
_partition_id: usize,
) -> Result<Box<dyn VectorIndex>> {
self.load(reader, offset, length).await
}
/// Return the IDs of rows in the index.
fn row_ids(&self) -> Box<dyn Iterator<Item = &'_ u64> + '_>;
/// Remap the index according to mapping
///
/// Each item in mapping describes an old row id -> new row id
/// pair. If old row id -> None then that row id has been
/// deleted and can be removed from the index.
///
/// If an old row id is not in the mapping then it should be
/// left alone.
async fn remap(&mut self, mapping: &HashMap<u64, Option<u64>>) -> Result<()>;
/// Remap the index according to mapping
///
/// write the remapped index to the index_dir
/// this is available for only v3 index
async fn remap_to(
self: Arc<Self>,
_store: ObjectStore,
_mapping: &HashMap<u64, Option<u64>>,
_column: String,
_index_dir: Path,
) -> Result<()> {
unimplemented!("only for v3 index")
}
/// The metric type of this vector index.
fn metric_type(&self) -> DistanceType;
fn ivf_model(&self) -> IvfModel;
fn quantizer(&self) -> Quantizer;
/// the index type of this vector index.
fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
}