llama_cpp_2/lib.rs
//! Bindings to the llama.cpp library.
//!
//! As llama.cpp is a very fast-moving target, this crate does not attempt to create a stable API
//! with all the Rust idioms. Instead it provides safe wrappers around nearly direct bindings to
//! llama.cpp. This makes it easier to keep up with changes in llama.cpp, but it does mean that
//! the API is not as nice as it could be.
//!
//! # Examples
//!
//! - [simple](https://github.com/utilityai/llama-cpp-rs/tree/main/simple)
//!
//! # Feature Flags
//!
//! - `cuda` enables CUDA GPU support.
//! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
use std::ffi::NulError;
use std::fmt::Debug;
use std::num::NonZeroI32;
use crate::llama_batch::BatchAddError;
use std::os::raw::c_int;
use std::path::PathBuf;
use std::string::FromUtf8Error;
pub mod context;
pub mod llama_backend;
pub mod llama_batch;
pub mod model;
pub mod sampling;
pub mod timing;
pub mod token;
pub mod token_type;
/// A fallible result from a llama.cpp function.
pub type Result<T> = std::result::Result<T, LLamaCppError>;
/// All errors that can occur in the llama-cpp crate.
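///
/// Every wrapped error type converts into this enum via the `From` impls that
/// `thiserror`'s `#[from]` derives, so `?` can widen a more specific error. A minimal
/// doctest sketch of that conversion:
///
/// ```
/// # use llama_cpp_2::{DecodeError, LLamaCppError};
/// let err = LLamaCppError::from(DecodeError::NoKvCacheSlot);
/// assert_eq!(err, LLamaCppError::DecodeError(DecodeError::NoKvCacheSlot));
/// ```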
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LLamaCppError {
/// The backend was already initialized. This can generally be ignored as initializing the backend
/// is idempotent.
#[error("BackendAlreadyInitialized")]
BackendAlreadyInitialized,
/// There was an error while getting the chat template from the model.
#[error("{0}")]
ChatTemplateError(#[from] ChatTemplateError),
/// There was an error while decoding a batch.
#[error("{0}")]
DecodeError(#[from] DecodeError),
/// There was an error while encoding a batch.
#[error("{0}")]
EncodeError(#[from] EncodeError),
/// There was an error loading a model.
#[error("{0}")]
LlamaModelLoadError(#[from] LlamaModelLoadError),
/// There was an error creating a new model context.
#[error("{0}")]
LlamaContextLoadError(#[from] LlamaContextLoadError),
/// There was an error adding a token to a batch.
#[error["{0}"]]
BatchAddError(#[from] BatchAddError),
/// see [`EmbeddingsError`]
#[error(transparent)]
EmbeddingError(#[from] EmbeddingsError),
// See [`LlamaSamplerError`]
}
/// There was an error while getting the chat template from a model.
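///
/// `BuffSizeError` reports the buffer size that would have been large enough, so a caller
/// can retry with a bigger buffer. A minimal sketch of matching on the variants (the hint
/// strings are illustrative only):
///
/// ```
/// # use llama_cpp_2::ChatTemplateError;
/// fn describe(err: &ChatTemplateError) -> String {
///     match err {
///         ChatTemplateError::BuffSizeError(needed) => {
///             format!("retry with a buffer of {} bytes", needed)
///         }
///         ChatTemplateError::MissingTemplate(code) => {
///             format!("the model has no chat template (code {})", code)
///         }
///         ChatTemplateError::Utf8Error(e) => format!("template is not valid utf8: {}", e),
///     }
/// }
/// ```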
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum ChatTemplateError {
/// the buffer was too small.
#[error("The buffer was too small. However, a buffer size of {0} would be just large enough.")]
BuffSizeError(usize),
/// The GGUF metadata has no chat template.
#[error("the model has no meta val - returned code {0}")]
MissingTemplate(i32),
/// The chat template was not valid utf8.
#[error(transparent)]
Utf8Error(#[from] std::str::Utf8Error),
}
/// Failed to load a context.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaContextLoadError {
/// llama.cpp returned null
#[error("null reference from llama.cpp")]
NullReturn,
}
/// Failed to decode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum DecodeError {
/// No kv cache slot was available.
#[error("Decode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
/// The number of tokens in the batch was 0.
#[error("Decode Error -1: n_tokens == 0")]
NTokensZero,
/// An unknown error occurred.
#[error("Decode Error {0}: unknown")]
Unknown(c_int),
}
/// Failed to encode a batch.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EncodeError {
/// No kv cache slot was available.
#[error("Encode Error 1: NoKvCacheSlot")]
NoKvCacheSlot,
/// The number of tokens in the batch was 0.
#[error("Encode Error -1: n_tokens == 0")]
NTokensZero,
/// An unknown error occurred.
#[error("Encode Error {0}: unknown")]
Unknown(c_int),
}
/// An error that can occur when embedding-related functions fail.
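///
/// A minimal sketch of mapping each variant to a recovery hint (the hint strings are
/// illustrative only):
///
/// ```
/// # use llama_cpp_2::EmbeddingsError;
/// fn hint(err: &EmbeddingsError) -> &'static str {
///     // illustrative hints only - adapt to your own error handling
///     match err {
///         EmbeddingsError::NotEnabled => "enable embeddings in the context params",
///         EmbeddingsError::LogitsNotEnabled => "request logits for the token in the batch",
///         EmbeddingsError::NonePoolType => "use per-token embeddings instead of sequence embeddings",
///     }
/// }
/// assert_eq!(hint(&EmbeddingsError::NotEnabled), "enable embeddings in the context params");
/// ```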
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum EmbeddingsError {
/// Embeddings weren't enabled in the context options
#[error("Embeddings weren't enabled in the context options")]
NotEnabled,
/// Logits weren't enabled for the given token
#[error("Logits were not enabled for the given token")]
LogitsNotEnabled,
/// Sequence embeddings were requested, but the model only supports `LLAMA_POOLING_TYPE_NONE`
#[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
NonePoolType,
}
/// Convert a non-zero return code from llama.cpp into a [`DecodeError`].
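///
/// A minimal doctest sketch of the mapping:
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::DecodeError;
/// let err = DecodeError::from(NonZeroI32::new(1).unwrap());
/// assert_eq!(err, DecodeError::NoKvCacheSlot);
/// ```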
impl From<NonZeroI32> for DecodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => DecodeError::NoKvCacheSlot,
-1 => DecodeError::NTokensZero,
i => DecodeError::Unknown(i),
}
}
}
/// Convert a non-zero return code from llama.cpp into an [`EncodeError`].
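///
/// A minimal doctest sketch of the mapping:
///
/// ```
/// # use std::num::NonZeroI32;
/// # use llama_cpp_2::EncodeError;
/// let err = EncodeError::from(NonZeroI32::new(-1).unwrap());
/// assert_eq!(err, EncodeError::NTokensZero);
/// ```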
impl From<NonZeroI32> for EncodeError {
fn from(value: NonZeroI32) -> Self {
match value.get() {
1 => EncodeError::NoKvCacheSlot,
-1 => EncodeError::NTokensZero,
i => EncodeError::Unknown(i),
}
}
}
/// An error that can occur when loading a model.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaModelLoadError {
/// There was a null byte in a provided string and thus it could not be converted to a C string.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// llama.cpp returned a nullptr - this could be many different causes.
#[error("null result from llama cpp")]
NullResult,
/// Failed to convert the path to a rust str. This means the path was not valid unicode
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
/// An error that can occur when initializing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterInitError {
/// There was a null byte in a provided string and thus it could not be converted to a C string.
#[error("null byte in string {0}")]
NullError(#[from] NulError),
/// llama.cpp returned a nullptr - this could be many different causes.
#[error("null result from llama cpp")]
NullResult,
/// Failed to convert the path to a rust str. This means the path was not valid unicode
#[error("failed to convert path {0} to str")]
PathToStrError(PathBuf),
}
/// An error that can occur when setting a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterSetError {
/// llama.cpp returned a non-zero error code.
#[error("error code from llama cpp")]
ErrorResult(i32),
}
/// An error that can occur when removing a LoRA adapter.
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
pub enum LlamaLoraAdapterRemoveError {
/// llama.cpp returned a non-zero error code.
#[error("error code from llama cpp")]
ErrorResult(i32),
}
/// get the time (in microseconds) according to llama.cpp
/// ```
/// # use llama_cpp_2::llama_time_us;
/// let time = llama_time_us();
/// assert!(time > 0);
/// ```
#[must_use]
pub fn llama_time_us() -> i64 {
unsafe { llama_cpp_sys_2::llama_time_us() }
}
/// get the max number of devices according to llama.cpp (this is generally CUDA devices)
/// ```
/// # use llama_cpp_2::max_devices;
/// let max_devices = max_devices();
/// assert!(max_devices >= 0);
/// ```
#[must_use]
pub fn max_devices() -> usize {
unsafe { llama_cpp_sys_2::llama_max_devices() }
}
/// is memory mapping supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mmap_supported;
/// let mmap_supported = mmap_supported();
/// if mmap_supported {
/// println!("mmap_supported!");
/// }
/// ```
#[must_use]
pub fn mmap_supported() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mmap() }
}
/// is memory locking supported according to llama.cpp
/// ```
/// # use llama_cpp_2::mlock_supported;
/// let mlock_supported = mlock_supported();
/// if mlock_supported {
/// println!("mlock_supported!");
/// }
/// ```
#[must_use]
pub fn mlock_supported() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}
/// An error that can occur when converting a token to a string.
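///
/// A minimal sketch of checking for the buffer-space case, e.g. to decide whether a
/// retry with a larger buffer is worthwhile:
///
/// ```
/// # use llama_cpp_2::TokenToStringError;
/// fn is_retryable(err: &TokenToStringError) -> bool {
///     // only the buffer-space variant is worth retrying
///     matches!(err, TokenToStringError::InsufficientBufferSpace(_))
/// }
/// assert!(!is_retryable(&TokenToStringError::UnknownTokenType));
/// ```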
#[derive(Debug, thiserror::Error, Clone)]
#[non_exhaustive]
pub enum TokenToStringError {
/// the token type was unknown
#[error("Unknown Token Type")]
UnknownTokenType,
/// There was insufficient buffer space to convert the token to a string.
#[error("Insufficient Buffer Space {0}")]
InsufficientBufferSpace(c_int),
/// The token was not valid utf8.
#[error("FromUtf8Error {0}")]
FromUtf8Error(#[from] FromUtf8Error),
}
/// Failed to convert a string to a token sequence.
#[derive(Debug, thiserror::Error)]
pub enum StringToTokenError {
/// the string contained a null byte and thus could not be converted to a c string.
#[error("{0}")]
NulError(#[from] NulError),
#[error("{0}")]
/// Failed to convert a provided integer to a [`c_int`].
CIntConversionError(#[from] std::num::TryFromIntError),
}
/// Failed to create a new chat message.
#[derive(Debug, thiserror::Error)]
pub enum NewLlamaChatMessageError {
/// the string contained a null byte and thus could not be converted to a c string.
#[error("{0}")]
NulError(#[from] NulError),
}
/// Failed to apply model chat template.
#[derive(Debug, thiserror::Error)]
pub enum ApplyChatTemplateError {
/// the string contained a null byte and thus could not be converted to a c string.
#[error("{0}")]
NulError(#[from] NulError),
/// the string could not be converted to utf8.
#[error("{0}")]
FromUtf8Error(#[from] FromUtf8Error),
}
/// Get the time in microseconds according to ggml
///
/// ```
/// # use std::time::Duration;
/// use llama_cpp_2::ggml_time_us;
///
/// let start = ggml_time_us();
///
/// std::thread::sleep(Duration::from_micros(10));
///
/// let end = ggml_time_us();
///
/// let elapsed = end - start;
///
/// assert!(elapsed >= 10);
/// ```
#[must_use]
pub fn ggml_time_us() -> i64 {
unsafe { llama_cpp_sys_2::ggml_time_us() }
}
/// checks if mlock is supported
///
/// ```
/// # use llama_cpp_2::llama_supports_mlock;
///
/// if llama_supports_mlock() {
/// println!("mlock is supported!");
/// } else {
/// println!("mlock is not supported!");
/// }
/// ```
#[must_use]
pub fn llama_supports_mlock() -> bool {
unsafe { llama_cpp_sys_2::llama_supports_mlock() }
}