// llama_cpp_sys_4/common.rs

//! Manual wrappers for values defined in `llama.cpp/common/common.h`.

use crate::{
    ggml_numa_strategy, llama_attention_type, llama_pooling_type, llama_rope_scaling_type,
    llama_split_mode, GGML_NUMA_STRATEGY_DISABLED, LLAMA_ATTENTION_TYPE_UNSPECIFIED,
    LLAMA_DEFAULT_SEED, LLAMA_POOLING_TYPE_UNSPECIFIED, LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
    LLAMA_SPLIT_MODE_LAYER,
};

pub const COMMON_SAMPLER_TYPE_NONE: common_sampler_type = 0;
pub const COMMON_SAMPLER_TYPE_DRY: common_sampler_type = 1;
pub const COMMON_SAMPLER_TYPE_TOP_K: common_sampler_type = 2;
pub const COMMON_SAMPLER_TYPE_TOP_P: common_sampler_type = 3;
pub const COMMON_SAMPLER_TYPE_MIN_P: common_sampler_type = 4;
pub const COMMON_SAMPLER_TYPE_TFS_Z: common_sampler_type = 5;
pub const COMMON_SAMPLER_TYPE_TYPICAL_P: common_sampler_type = 6;
pub const COMMON_SAMPLER_TYPE_TEMPERATURE: common_sampler_type = 7;
pub const COMMON_SAMPLER_TYPE_XTC: common_sampler_type = 8;
pub const COMMON_SAMPLER_TYPE_INFILL: common_sampler_type = 9;
pub type common_sampler_type = ::core::ffi::c_uint;
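
// A minimal illustrative helper (hypothetical: not part of this crate's
// generated bindings nor of llama.cpp's C API): maps a `common_sampler_type`
// to a printable name, e.g. for logging the sampler chain configured in
// `common_sampler_params::default()` below.
#[allow(dead_code)]
fn common_sampler_type_name(t: common_sampler_type) -> &'static str {
    match t {
        COMMON_SAMPLER_TYPE_NONE => "none",
        COMMON_SAMPLER_TYPE_DRY => "dry",
        COMMON_SAMPLER_TYPE_TOP_K => "top_k",
        COMMON_SAMPLER_TYPE_TOP_P => "top_p",
        COMMON_SAMPLER_TYPE_MIN_P => "min_p",
        COMMON_SAMPLER_TYPE_TFS_Z => "tfs_z",
        COMMON_SAMPLER_TYPE_TYPICAL_P => "typical_p",
        COMMON_SAMPLER_TYPE_TEMPERATURE => "temperature",
        COMMON_SAMPLER_TYPE_XTC => "xtc",
        COMMON_SAMPLER_TYPE_INFILL => "infill",
        _ => "unknown",
    }
}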

/// common sampler params
#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct common_sampler_params {
    /// the seed used to initialize `llama_sampler`
    pub seed: u32,
    /// number of previous tokens to remember
    pub n_prev: i32,
    /// if greater than 0, output the probabilities of top `n_probs` tokens.
    pub n_probs: i32,
    /// 0 = disabled, otherwise samplers should return at least `min_keep` tokens
    pub min_keep: i32,
    /// <= 0 to use vocab size
    pub top_k: i32,
    /// 1.0 = disabled
    pub top_p: f32,
    /// 0.0 = disabled
    pub min_p: f32,
    /// 0.0 = disabled
    pub xtc_probability: f32,
    /// > 0.5 disables XTC
    pub xtc_threshold: f32,
    /// 1.0 = disabled
    pub tfs_z: f32,
    /// typical_p, 1.0 = disabled
    pub typ_p: f32,
    /// <= 0.0 to sample greedily, 0.0 to not output probabilities
    pub temp: f32,
    /// 0.0 = disabled
    pub dynatemp_range: f32,
    /// controls how entropy maps to temperature in dynamic temperature sampler
    pub dynatemp_exponent: f32,
    /// last n tokens to penalize (0 = disable penalty, -1 = context size)
    pub penalty_last_n: i32,
    /// 1.0 = disabled
    pub penalty_repeat: f32,
    /// 0.0 = disabled
    pub penalty_freq: f32,
    /// 0.0 = disabled
    pub penalty_present: f32,
    /// 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
    pub dry_multiplier: f32,
    /// 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
    pub dry_base: f32,
    /// tokens extending repetitions beyond this receive penalty
    pub dry_allowed_length: i32,
    /// how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
    pub dry_penalty_last_n: i32,
    /// 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    pub mirostat: i32,
    /// target entropy
    pub mirostat_tau: f32,
    /// learning rate
    pub mirostat_eta: f32,
    /// consider newlines as a repeatable token
    pub penalize_nl: bool,
    /// ignore the end-of-sequence token when sampling
    pub ignore_eos: bool,
    /// disable performance metrics
    pub no_perf: bool,
    /// sequence breakers for the DRY sampler
    pub dry_sequence_breakers: Vec<String>,
    /// the samplers to apply, in order
    pub samplers: Vec<common_sampler_type>,
    /// optional BNF-like grammar to constrain sampling
    pub grammar: Vec<String>,
    /// logit biases to apply
    pub logit_bias: Vec<(i32, f64)>,
}

impl Default for common_sampler_params {
    fn default() -> Self {
        Self {
            seed: LLAMA_DEFAULT_SEED, // the seed used to initialize llama_sampler
            n_prev: 64,               // number of previous tokens to remember
            n_probs: 0, // if greater than 0, output the probabilities of top n_probs tokens.
            min_keep: 0, // 0 = disabled, otherwise samplers should return at least min_keep tokens
            top_k: 40,  // <= 0 to use vocab size
            top_p: 0.95, // 1.0 = disabled
            min_p: 0.05, // 0.0 = disabled
            xtc_probability: 0.00, // 0.0 = disabled
            xtc_threshold: 0.10, // > 0.5 disables XTC
            tfs_z: 1.00, // 1.0 = disabled
            typ_p: 1.00, // typical_p, 1.0 = disabled
            temp: 0.80, // <= 0.0 to sample greedily, 0.0 to not output probabilities
            dynatemp_range: 0.00, // 0.0 = disabled
            dynatemp_exponent: 1.00, // controls how entropy maps to temperature in dynamic temperature sampler
            penalty_last_n: 64, // last n tokens to penalize (0 = disable penalty, -1 = context size)
            penalty_repeat: 1.00, // 1.0 = disabled
            penalty_freq: 0.00, // 0.0 = disabled
            penalty_present: 0.00, // 0.0 = disabled
            dry_multiplier: 0.0, // 0.0 = disabled;      DRY repetition penalty for tokens extending repetition:
            dry_base: 1.75, // 0.0 = disabled;      multiplier * base ^ (length of sequence before token - allowed length)
            dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty
            dry_penalty_last_n: -1, // how many tokens to scan for repetitions (0 = disable penalty, -1 = context size)
            mirostat: 0,            // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
            mirostat_tau: 5.00,     // target entropy
            mirostat_eta: 0.10,     // learning rate
            penalize_nl: false,     // consider newlines as a repeatable token
            ignore_eos: false,
            no_perf: false, // disable performance metrics

            dry_sequence_breakers: vec!["\n".into(), ":".into(), "\"".into(), "*".into()], // default sequence breakers for DRY

            samplers: vec![
                COMMON_SAMPLER_TYPE_DRY,
                COMMON_SAMPLER_TYPE_TOP_K,
                COMMON_SAMPLER_TYPE_TFS_Z,
                COMMON_SAMPLER_TYPE_TYPICAL_P,
                COMMON_SAMPLER_TYPE_TOP_P,
                COMMON_SAMPLER_TYPE_MIN_P,
                COMMON_SAMPLER_TYPE_XTC,
                COMMON_SAMPLER_TYPE_TEMPERATURE,
            ],

            grammar: vec![], // optional BNF-like grammar to constrain sampling

            logit_bias: vec![], // logit biases to apply
        }
    }
}
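
// Illustrative usage (a sketch, not upstream API): the `Default` impl above
// is typically combined with struct-update syntax to override a handful of
// sampling fields while keeping the remaining defaults intact.
#[cfg(test)]
mod common_sampler_params_tests {
    use super::*;

    #[test]
    fn override_a_few_defaults() {
        // Near-greedy decoding: zero temperature and top-1, everything else default.
        let params = common_sampler_params {
            temp: 0.0,
            top_k: 1,
            ..Default::default()
        };
        assert_eq!(params.top_p, 0.95); // untouched defaults survive
        assert_eq!(params.samplers.first(), Some(&COMMON_SAMPLER_TYPE_DRY));
    }
}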

#[repr(C)]
#[derive(Debug, PartialEq)]
pub struct common_params {
    /// new tokens to predict
    pub n_predict: i32,
    /// context size
    pub n_ctx: i32,
    /// logical batch size for prompt processing (must be >=32 to use BLAS)
    pub n_batch: i32,
    /// physical batch size for prompt processing (must be >=32 to use BLAS)
    pub n_ubatch: i32,
    /// number of tokens to keep from initial prompt
    pub n_keep: i32,
    /// number of tokens to draft during speculative decoding
    pub n_draft: i32,
    /// max number of chunks to process (-1 = unlimited)
    pub n_chunks: i32,
    /// number of parallel sequences to decode
    pub n_parallel: i32,
    /// number of sequences to decode
    pub n_sequences: i32,
    /// speculative decoding split probability
    pub p_split: f32,
    /// number of layers to store in VRAM (-1 - use default)
    pub n_gpu_layers: i32,
    /// number of layers to store in VRAM for the draft model (-1 - use default)
    pub n_gpu_layers_draft: i32,
    /// the GPU that is used for scratch and small tensors
    pub main_gpu: i32,
    // how split tensors should be distributed across GPUs
    // pub tensor_split: [f32; 128usize],
    /// group-attention factor
    pub grp_attn_n: i32,
    /// group-attention width
    pub grp_attn_w: i32,
    /// print token count every n tokens (-1 = disabled)
    pub n_print: i32,
    /// RoPE base frequency
    pub rope_freq_base: f32,
    /// RoPE frequency scaling factor
    pub rope_freq_scale: f32,
    /// YaRN extrapolation mix factor
    pub yarn_ext_factor: f32,
    /// YaRN magnitude scaling factor
    pub yarn_attn_factor: f32,
    /// YaRN low correction dim
    pub yarn_beta_fast: f32,
    /// YaRN high correction dim
    pub yarn_beta_slow: f32,
    /// YaRN original context length
    pub yarn_orig_ctx: i32,
    /// KV cache defragmentation threshold
    pub defrag_thold: f32,
    // pub cpuparams: cpu_params,
    // pub cpuparams_batch: cpu_params,
    // pub draft_cpuparams: cpu_params,
    // pub draft_cpuparams_batch: cpu_params,
    // pub cb_eval: ggml_backend_sched_eval_callback,
    // pub cb_eval_user_data: *mut ::core::ffi::c_void,
    pub numa: ggml_numa_strategy,
    pub split_mode: llama_split_mode,
    pub rope_scaling_type: llama_rope_scaling_type,
    pub pooling_type: llama_pooling_type,
    pub attention_type: llama_attention_type,
    pub sparams: common_sampler_params,
    // pub model: std___1_string,
    // pub model_draft: std___1_string,
    // pub model_alias: std___1_string,
    // pub model_url: std___1_string,
    // pub hf_token: std___1_string,
    // pub hf_repo: std___1_string,
    // pub hf_file: std___1_string,
    // pub prompt: std___1_string,
    // pub prompt_file: std___1_string,
    // pub path_prompt_cache: std___1_string,
    // pub input_prefix: std___1_string,
    // pub input_suffix: std___1_string,
    // pub logdir: std___1_string,
    // pub lookup_cache_static: std___1_string,
    // pub lookup_cache_dynamic: std___1_string,
    // pub logits_file: std___1_string,
    // pub rpc_servers: std___1_string,
    // pub in_files: [u64; 3usize],
    // pub antiprompt: [u64; 3usize],
    // pub kv_overrides: [u64; 3usize],
    // pub lora_init_without_apply: bool,
    // pub lora_adapters: [u64; 3usize],
    // pub control_vectors: [u64; 3usize],
    // pub verbosity: i32,
    // pub control_vector_layer_start: i32,
    // pub control_vector_layer_end: i32,
    // pub ppl_stride: i32,
    // pub ppl_output_type: i32,
    // pub hellaswag: bool,
    // pub hellaswag_tasks: usize,
    // pub winogrande: bool,
    // pub winogrande_tasks: usize,
    // pub multiple_choice: bool,
    // pub multiple_choice_tasks: usize,
    // pub kl_divergence: bool,
    // pub usage: bool,
    // pub use_color: bool,
    // pub special: bool,
    // pub interactive: bool,
    // pub interactive_first: bool,
    // pub conversation: bool,
    // pub prompt_cache_all: bool,
    // pub prompt_cache_ro: bool,
    // pub escape: bool,
    // pub multiline_input: bool,
    // pub simple_io: bool,
    // pub cont_batching: bool,
    // pub flash_attn: bool,
    // pub no_perf: bool,
    // pub ctx_shift: bool,
    // pub input_prefix_bos: bool,
    // pub logits_all: bool,
    // pub use_mmap: bool,
    // pub use_mlock: bool,
    // pub verbose_prompt: bool,
    // pub display_prompt: bool,
    // pub dump_kv_cache: bool,
    // pub no_kv_offload: bool,
    // pub warmup: bool,
    // pub check_tensors: bool,
    // pub cache_type_k: std___1_string,
    // pub cache_type_v: std___1_string,
    // pub mmproj: std___1_string,
    // pub image: [u64; 3usize],
    // pub embedding: bool,
    // pub embd_normalize: i32,
    // pub embd_out: std___1_string,
    // pub embd_sep: std___1_string,
    // pub reranking: bool,
    // pub port: i32,
    // pub timeout_read: i32,
    // pub timeout_write: i32,
    // pub n_threads_http: i32,
    // pub n_cache_reuse: i32,
    // pub hostname: std___1_string,
    // pub public_path: std___1_string,
    // pub chat_template: std___1_string,
    // pub enable_chat_template: bool,
    // pub api_keys: [u64; 3usize],
    // pub ssl_file_key: std___1_string,
    // pub ssl_file_cert: std___1_string,
    // pub webui: bool,
    // pub endpoint_slots: bool,
    // pub endpoint_props: bool,
    // pub endpoint_metrics: bool,
    // pub log_json: bool,
    // pub slot_save_path: std___1_string,
    // pub slot_prompt_similarity: f32,
    // pub is_pp_shared: bool,
    // pub n_pp: [u64; 3usize],
    // pub n_tg: [u64; 3usize],
    // pub n_pl: [u64; 3usize],
    // pub context_files: [u64; 3usize],
    // pub chunk_size: i32,
    // pub chunk_separator: std___1_string,
    // pub n_junk: i32,
    // pub i_pos: i32,
    // pub out_file: std___1_string,
    // pub n_out_freq: i32,
    // pub n_save_freq: i32,
    // pub i_chunk: i32,
    // pub process_output: bool,
    // pub compute_ppl: bool,
    // pub n_pca_batch: ::core::ffi::c_int,
    // pub n_pca_iterations: ::core::ffi::c_int,
    // pub cvector_dimre_method: dimre_method,
    // pub cvector_outfile: std___1_string,
    // pub cvector_positive_file: std___1_string,
    // pub cvector_negative_file: std___1_string,
    // pub spm_infill: bool,
    // pub lora_outfile: std___1_string,
    // pub batched_bench_output_jsonl: bool,
}

impl Default for common_params {
    fn default() -> Self {
        Self {
            n_predict: -1,          // new tokens to predict
            n_ctx: 0,               // context size
            n_batch: 2048, // logical batch size for prompt processing (must be >=32 to use BLAS)
            n_ubatch: 512, // physical batch size for prompt processing (must be >=32 to use BLAS)
            n_keep: 0,     // number of tokens to keep from initial prompt
            n_draft: 5,    // number of tokens to draft during speculative decoding
            n_chunks: -1,  // max number of chunks to process (-1 = unlimited)
            n_parallel: 1, // number of parallel sequences to decode
            n_sequences: 1, // number of sequences to decode
            p_split: 0.1,  // speculative decoding split probability
            n_gpu_layers: -1, // number of layers to store in VRAM (-1 - use default)
            n_gpu_layers_draft: -1, // number of layers to store in VRAM for the draft model (-1 - use default)
            main_gpu: 0,            // the GPU that is used for scratch and small tensors
            // tensor_split[128]     :   {0}, // how split tensors should be distributed across GPUs
            grp_attn_n: 1,         // group-attention factor
            grp_attn_w: 512,       // group-attention width
            n_print: -1,           // print token count every n tokens (-1 = disabled)
            rope_freq_base: 0.0,   // RoPE base frequency
            rope_freq_scale: 0.0,  // RoPE frequency scaling factor
            yarn_ext_factor: -1.0, // YaRN extrapolation mix factor
            yarn_attn_factor: 1.0, // YaRN magnitude scaling factor
            yarn_beta_fast: 32.0,  // YaRN low correction dim
            yarn_beta_slow: 1.0,   // YaRN high correction dim
            yarn_orig_ctx: 0,      // YaRN original context length
            defrag_thold: -1.0,    // KV cache defragmentation threshold

            // struct cpu_params cpuparams;
            // struct cpu_params cpuparams_batch;
            // struct cpu_params draft_cpuparams;
            // struct cpu_params draft_cpuparams_batch;

            // ggml_backend_sched_eval_callback cb_eval = nullptr;
            // void * cb_eval_user_data                 = nullptr;
            numa: GGML_NUMA_STRATEGY_DISABLED,

            split_mode: LLAMA_SPLIT_MODE_LAYER, // how to split the model across GPUs
            rope_scaling_type: LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
            pooling_type: LLAMA_POOLING_TYPE_UNSPECIFIED, // pooling type for embeddings
            attention_type: LLAMA_ATTENTION_TYPE_UNSPECIFIED, // attention type for embeddings

            sparams: common_sampler_params::default(),
        }
    }
}
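
// A small sanity sketch (assumption: these defaults are intended to track the
// field initializers in llama.cpp's common/common.h): spot-checks a couple of
// values and that the sampler params are nested with their own defaults.
#[cfg(test)]
mod common_params_tests {
    use super::*;

    #[test]
    fn default_values_are_nested() {
        let params = common_params::default();
        assert_eq!(params.n_predict, -1); // matches the field initializer above
        assert_eq!(params.n_batch, 2048);
        assert_eq!(params.sparams, common_sampler_params::default());
    }
}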