// llama_cpp_sys_4/common.rs
//! Manual wrapper for values in llama.cpp/common/common.h
use crate::{
ggml_numa_strategy, llama_attention_type, llama_pooling_type, llama_rope_scaling_type,
llama_split_mode, GGML_NUMA_STRATEGY_DISABLED, LLAMA_ATTENTION_TYPE_UNSPECIFIED,
LLAMA_DEFAULT_SEED, LLAMA_POOLING_TYPE_UNSPECIFIED, LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
LLAMA_SPLIT_MODE_LAYER,
};
/// Integer type that carries a sampler kind; one of the `COMMON_SAMPLER_TYPE_*`
/// constants declared below.
pub type common_sampler_type = ::core::ffi::c_uint;

/// Sampler kind `NONE` (numeric value 0).
pub const COMMON_SAMPLER_TYPE_NONE: common_sampler_type = 0;

/// Sampler kind `DRY` (numeric value 1).
pub const COMMON_SAMPLER_TYPE_DRY: common_sampler_type = 1;

/// Sampler kind `TOP_K` (numeric value 2).
pub const COMMON_SAMPLER_TYPE_TOP_K: common_sampler_type = 2;

/// Sampler kind `TOP_P` (numeric value 3).
pub const COMMON_SAMPLER_TYPE_TOP_P: common_sampler_type = 3;

/// Sampler kind `MIN_P` (numeric value 4).
pub const COMMON_SAMPLER_TYPE_MIN_P: common_sampler_type = 4;

/// Sampler kind `TFS_Z` (numeric value 5).
pub const COMMON_SAMPLER_TYPE_TFS_Z: common_sampler_type = 5;

/// Sampler kind `TYPICAL_P` (numeric value 6).
pub const COMMON_SAMPLER_TYPE_TYPICAL_P: common_sampler_type = 6;

/// Sampler kind `TEMPERATURE` (numeric value 7).
pub const COMMON_SAMPLER_TYPE_TEMPERATURE: common_sampler_type = 7;

/// Sampler kind `XTC` (numeric value 8).
pub const COMMON_SAMPLER_TYPE_XTC: common_sampler_type = 8;

/// Sampler kind `INFILL` (numeric value 9).
pub const COMMON_SAMPLER_TYPE_INFILL: common_sampler_type = 9;
/// Sampling parameters: a manual Rust counterpart of `common_sampler_params`
/// from llama.cpp's `common/common.h`.
///
/// `#[repr(C)]` keeps the fields in declaration order, but the trailing
/// collection fields are Rust `Vec`s.
/// NOTE(review): that presumably makes this struct layout-incompatible with the
/// C++ original, so it should not be handed across FFI as-is — confirm.
#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct common_sampler_params {
/// the seed used to initialize `llama_sampler`
pub seed: u32,
/// number of previous tokens to remember
pub n_prev: i32,
/// if greater than 0, output the probabilities of top `n_probs` tokens.
pub n_probs: i32,
/// 0 = disabled, otherwise samplers should return at least `min_keep` tokens
pub min_keep: i32,
/// <= 0 to use vocab size
pub top_k: i32,
/// 1.0 = disabled
pub top_p: f32,
/// 0.0 = disabled
pub min_p: f32,
/// 0.0 = disabled
pub xtc_probability: f32,
/// values above 0.5 disable XTC
pub xtc_threshold: f32,
/// 1.0 = disabled
pub tfs_z: f32,
/// typical_p, 1.0 = disabled
pub typ_p: f32,
/// <= 0.0 to sample greedily, 0.0 to not output probabilities
pub temp: f32,
/// 0.0 = disabled
pub dynatemp_range: f32,
/// controls how entropy maps to temperature in dynamic temperature sampler
pub dynatemp_exponent: f32,
/// last n tokens to penalize (0 = disable penalty, -1 = context size)
pub penalty_last_n: i32,
/// 1.0 = disabled
pub penalty_repeat: f32,
/// 0.0 = disabled
pub penalty_freq: f32,
/// 0.0 = disabled
pub penalty_present: f32,
/// 0.0 = disabled; DRY repetition penalty for tokens extending repetition:
pub dry_multiplier: f32,
/// 0.0 = disabled; multiplier * base ^ (length of sequence before token - allowed length)
pub dry_base: f32,
/// tokens extending repetitions beyond this receive penalty
pub dry_allowed_length: i32,
/// how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
pub dry_penalty_last_n: i32,
/// 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
pub mirostat: i32,
/// target entropy
pub mirostat_tau: f32,
/// learning rate
pub mirostat_eta: f32,
/// consider newlines as a repeatable token
pub penalize_nl: bool,
/// NOTE(review): undocumented here; presumably tells sampling to ignore the
/// end-of-sequence token — confirm against upstream.
pub ignore_eos: bool,
/// disable performance metrics
pub no_perf: bool,
/// sequence breakers for DRY
pub dry_sequence_breakers: Vec<String>,
/// sampler kinds, each one of the `COMMON_SAMPLER_TYPE_*` values
pub samplers: Vec<common_sampler_type>,
/// optional BNF-like grammar to constrain sampling (empty = none by default)
pub grammar: Vec<String>,
/// logit biases to apply
/// NOTE(review): each pair is presumably `(token id, bias)` — confirm.
pub logit_bias: Vec<(i32, f64)>,
}
impl Default for common_sampler_params {
    /// Produces the stock sampler configuration; the notes beside each group
    /// spell out which values mean "disabled".
    fn default() -> Self {
        // Default sequence breakers for DRY.
        let dry_sequence_breakers: Vec<String> = ["\n", ":", "\"", "*"]
            .iter()
            .map(|breaker| breaker.to_string())
            .collect();

        // Sampler kinds listed by default.
        let samplers = vec![
            COMMON_SAMPLER_TYPE_DRY,
            COMMON_SAMPLER_TYPE_TOP_K,
            COMMON_SAMPLER_TYPE_TFS_Z,
            COMMON_SAMPLER_TYPE_TYPICAL_P,
            COMMON_SAMPLER_TYPE_TOP_P,
            COMMON_SAMPLER_TYPE_MIN_P,
            COMMON_SAMPLER_TYPE_XTC,
            COMMON_SAMPLER_TYPE_TEMPERATURE,
        ];

        Self {
            // Seed used to initialize llama_sampler.
            seed: LLAMA_DEFAULT_SEED,

            // History / reporting: remember 64 previous tokens, output no
            // top-token probabilities, no minimum number of kept tokens.
            n_prev: 64,
            n_probs: 0,
            min_keep: 0,

            // Truncation samplers. top_k <= 0 would use the vocab size;
            // top_p / tfs_z / typ_p are disabled at 1.0, min_p at 0.0.
            top_k: 40,
            top_p: 0.95,
            min_p: 0.05,
            // XTC: probability 0.0 = disabled; a threshold above 0.5 disables it.
            xtc_probability: 0.0,
            xtc_threshold: 0.1,
            tfs_z: 1.0,
            typ_p: 1.0,

            // Temperature: <= 0.0 samples greedily, 0.0 outputs no probabilities.
            temp: 0.8,
            // Dynamic temperature: range 0.0 = disabled; the exponent controls
            // how entropy maps to temperature.
            dynatemp_range: 0.0,
            dynatemp_exponent: 1.0,

            // Repetition penalties over the last n tokens
            // (0 = disable penalty, -1 = context size).
            penalty_last_n: 64,
            // 1.0 = disabled.
            penalty_repeat: 1.0,
            // 0.0 = disabled.
            penalty_freq: 0.0,
            penalty_present: 0.0,

            // DRY repetition penalty for tokens extending a repetition:
            // multiplier * base ^ (length of sequence before token - allowed length).
            // A multiplier of 0.0 disables it.
            dry_multiplier: 0.0,
            dry_base: 1.75,
            // Tokens extending repetitions beyond this length receive the penalty.
            dry_allowed_length: 2,
            // Tokens to scan for repetitions (0 = disable penalty, -1 = context size).
            dry_penalty_last_n: -1,

            // Mirostat: 0 = disabled, 1 = mirostat, 2 = mirostat 2.0;
            // tau is the target entropy, eta the learning rate.
            mirostat: 0,
            mirostat_tau: 5.0,
            mirostat_eta: 0.1,

            // Flags: newlines are not treated as a repeatable token, and
            // performance metrics stay enabled.
            penalize_nl: false,
            ignore_eos: false,
            no_perf: false,

            dry_sequence_breakers,
            samplers,

            // Optional BNF-like grammar to constrain sampling.
            grammar: Vec::new(),
            // Logit biases to apply.
            logit_bias: Vec::new(),
        }
    }
}
/// General run parameters: a partial, manual Rust counterpart of
/// `common_params` from llama.cpp's `common/common.h`.
///
/// Only the fields declared below are mirrored. The commented-out entries
/// record further fields that have no Rust counterpart in this struct.
/// NOTE(review): with fields omitted, this presumably does not match the
/// upstream memory layout despite `#[repr(C)]` — confirm before any FFI use.
#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct common_params {
/// new tokens to predict
pub n_predict: i32,
/// context size
pub n_ctx: i32,
/// logical batch size for prompt processing (must be >=32 to use BLAS)
pub n_batch: i32,
/// physical batch size for prompt processing (must be >=32 to use BLAS)
pub n_ubatch: i32,
/// number of tokens to keep from initial prompt
pub n_keep: i32,
/// number of tokens to draft during speculative decoding
pub n_draft: i32,
/// max number of chunks to process (-1 = unlimited)
pub n_chunks: i32,
/// number of parallel sequences to decode
pub n_parallel: i32,
/// number of sequences to decode
pub n_sequences: i32,
/// speculative decoding split probability
pub p_split: f32,
/// number of layers to store in VRAM (-1 - use default)
pub n_gpu_layers: i32,
/// number of layers to store in VRAM for the draft model (-1 - use default)
pub n_gpu_layers_draft: i32,
/// the GPU that is used for scratch and small tensors
pub main_gpu: i32,
// Not mirrored: how split tensors should be distributed across GPUs.
// (Plain comment on purpose: a `///` here would attach to `grp_attn_n`.)
// pub tensor_split: [f32; 128usize],
/// group-attention factor
pub grp_attn_n: i32,
/// group-attention width
pub grp_attn_w: i32,
/// print token count every n tokens (-1 = disabled)
pub n_print: i32,
/// RoPE base frequency
pub rope_freq_base: f32,
/// RoPE frequency scaling factor
pub rope_freq_scale: f32,
/// YaRN extrapolation mix factor
pub yarn_ext_factor: f32,
/// YaRN magnitude scaling factor
pub yarn_attn_factor: f32,
/// YaRN low correction dim
pub yarn_beta_fast: f32,
/// YaRN high correction dim
pub yarn_beta_slow: f32,
/// YaRN original context length
pub yarn_orig_ctx: i32,
/// KV cache defragmentation threshold
pub defrag_thold: f32,
// Not mirrored: CPU parameter sets and the eval callback.
// pub cpuparams: cpu_params,
// pub cpuparams_batch: cpu_params,
// pub draft_cpuparams: cpu_params,
// pub draft_cpuparams_batch: cpu_params,
// pub cb_eval: ggml_backend_sched_eval_callback,
// pub cb_eval_user_data: *mut ::core::ffi::c_void,
/// NUMA strategy
pub numa: ggml_numa_strategy,
/// how to split the model across GPUs
pub split_mode: llama_split_mode,
/// RoPE scaling type
pub rope_scaling_type: llama_rope_scaling_type,
/// pooling type for embeddings
pub pooling_type: llama_pooling_type,
/// attention type for embeddings
pub attention_type: llama_attention_type,
/// sampler parameters
pub sparams: common_sampler_params,
// Not mirrored: everything below is kept only as a list of field names and
// placeholder types; none of it is part of this struct.
// pub model: std___1_string,
// pub model_draft: std___1_string,
// pub model_alias: std___1_string,
// pub model_url: std___1_string,
// pub hf_token: std___1_string,
// pub hf_repo: std___1_string,
// pub hf_file: std___1_string,
// pub prompt: std___1_string,
// pub prompt_file: std___1_string,
// pub path_prompt_cache: std___1_string,
// pub input_prefix: std___1_string,
// pub input_suffix: std___1_string,
// pub logdir: std___1_string,
// pub lookup_cache_static: std___1_string,
// pub lookup_cache_dynamic: std___1_string,
// pub logits_file: std___1_string,
// pub rpc_servers: std___1_string,
// pub in_files: [u64; 3usize],
// pub antiprompt: [u64; 3usize],
// pub kv_overrides: [u64; 3usize],
// pub lora_init_without_apply: bool,
// pub lora_adapters: [u64; 3usize],
// pub control_vectors: [u64; 3usize],
// pub verbosity: i32,
// pub control_vector_layer_start: i32,
// pub control_vector_layer_end: i32,
// pub ppl_stride: i32,
// pub ppl_output_type: i32,
// pub hellaswag: bool,
// pub hellaswag_tasks: usize,
// pub winogrande: bool,
// pub winogrande_tasks: usize,
// pub multiple_choice: bool,
// pub multiple_choice_tasks: usize,
// pub kl_divergence: bool,
// pub usage: bool,
// pub use_color: bool,
// pub special: bool,
// pub interactive: bool,
// pub interactive_first: bool,
// pub conversation: bool,
// pub prompt_cache_all: bool,
// pub prompt_cache_ro: bool,
// pub escape: bool,
// pub multiline_input: bool,
// pub simple_io: bool,
// pub cont_batching: bool,
// pub flash_attn: bool,
// pub no_perf: bool,
// pub ctx_shift: bool,
// pub input_prefix_bos: bool,
// pub logits_all: bool,
// pub use_mmap: bool,
// pub use_mlock: bool,
// pub verbose_prompt: bool,
// pub display_prompt: bool,
// pub dump_kv_cache: bool,
// pub no_kv_offload: bool,
// pub warmup: bool,
// pub check_tensors: bool,
// pub cache_type_k: std___1_string,
// pub cache_type_v: std___1_string,
// pub mmproj: std___1_string,
// pub image: [u64; 3usize],
// pub embedding: bool,
// pub embd_normalize: i32,
// pub embd_out: std___1_string,
// pub embd_sep: std___1_string,
// pub reranking: bool,
// pub port: i32,
// pub timeout_read: i32,
// pub timeout_write: i32,
// pub n_threads_http: i32,
// pub n_cache_reuse: i32,
// pub hostname: std___1_string,
// pub public_path: std___1_string,
// pub chat_template: std___1_string,
// pub enable_chat_template: bool,
// pub api_keys: [u64; 3usize],
// pub ssl_file_key: std___1_string,
// pub ssl_file_cert: std___1_string,
// pub webui: bool,
// pub endpoint_slots: bool,
// pub endpoint_props: bool,
// pub endpoint_metrics: bool,
// pub log_json: bool,
// pub slot_save_path: std___1_string,
// pub slot_prompt_similarity: f32,
// pub is_pp_shared: bool,
// pub n_pp: [u64; 3usize],
// pub n_tg: [u64; 3usize],
// pub n_pl: [u64; 3usize],
// pub context_files: [u64; 3usize],
// pub chunk_size: i32,
// pub chunk_separator: std___1_string,
// pub n_junk: i32,
// pub i_pos: i32,
// pub out_file: std___1_string,
// pub n_out_freq: i32,
// pub n_save_freq: i32,
// pub i_chunk: i32,
// pub process_output: bool,
// pub compute_ppl: bool,
// pub n_pca_batch: ::core::ffi::c_int,
// pub n_pca_iterations: ::core::ffi::c_int,
// pub cvector_dimre_method: dimre_method,
// pub cvector_outfile: std___1_string,
// pub cvector_positive_file: std___1_string,
// pub cvector_negative_file: std___1_string,
// pub spm_infill: bool,
// pub lora_outfile: std___1_string,
// pub batched_bench_output_jsonl: bool,
}
impl Default for common_params {
    /// Produces the stock run configuration, with the sampler parameters taken
    /// from `common_sampler_params::default()`.
    fn default() -> Self {
        let sparams = common_sampler_params::default();

        Self {
            // New tokens to predict.
            n_predict: -1,
            // Context size.
            n_ctx: 0,
            // Logical / physical batch size for prompt processing
            // (must be >=32 to use BLAS).
            n_batch: 2048,
            n_ubatch: 512,
            // Number of tokens to keep from the initial prompt.
            n_keep: 0,
            // Number of tokens to draft during speculative decoding.
            n_draft: 5,
            // Max number of chunks to process (-1 = unlimited).
            n_chunks: -1,
            // Number of parallel sequences to decode.
            n_parallel: 1,
            // Number of sequences to decode.
            n_sequences: 1,
            // Speculative decoding split probability.
            p_split: 0.1,

            // Layers to store in VRAM for the main and the draft model
            // (-1 - use default).
            n_gpu_layers: -1,
            n_gpu_layers_draft: -1,
            // The GPU that is used for scratch and small tensors.
            main_gpu: 0,
            // `tensor_split[128]` (how split tensors should be distributed
            // across GPUs) has no field in this struct.

            // Group-attention factor and width.
            grp_attn_n: 1,
            grp_attn_w: 512,
            // Print token count every n tokens (-1 = disabled).
            n_print: -1,

            // RoPE base frequency and frequency scaling factor.
            rope_freq_base: 0.0,
            rope_freq_scale: 0.0,
            // YaRN: extrapolation mix factor, magnitude scaling factor,
            // low / high correction dim, original context length.
            yarn_ext_factor: -1.0,
            yarn_attn_factor: 1.0,
            yarn_beta_fast: 32.0,
            yarn_beta_slow: 1.0,
            yarn_orig_ctx: 0,
            // KV cache defragmentation threshold.
            defrag_thold: -1.0,

            // `cpuparams`, `cpuparams_batch`, `draft_cpuparams`,
            // `draft_cpuparams_batch`, `cb_eval` and `cb_eval_user_data`
            // have no fields in this struct.
            numa: GGML_NUMA_STRATEGY_DISABLED,
            // How to split the model across GPUs.
            split_mode: LLAMA_SPLIT_MODE_LAYER,
            rope_scaling_type: LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
            // Pooling type for embeddings.
            pooling_type: LLAMA_POOLING_TYPE_UNSPECIFIED,
            // Attention type for embeddings.
            attention_type: LLAMA_ATTENTION_TYPE_UNSPECIFIED,

            sparams,
        }
    }
}