#[repr(C)]
pub struct common_params {
pub n_predict: i32,
pub n_ctx: i32,
pub n_batch: i32,
pub n_ubatch: i32,
pub n_keep: i32,
pub n_draft: i32,
pub n_chunks: i32,
pub n_parallel: i32,
pub n_sequences: i32,
pub p_split: f32,
pub n_gpu_layers: i32,
pub n_gpu_layers_draft: i32,
pub main_gpu: i32,
pub grp_attn_n: i32,
pub grp_attn_w: i32,
pub n_print: i32,
pub rope_freq_base: f32,
pub rope_freq_scale: f32,
pub yarn_ext_factor: f32,
pub yarn_attn_factor: f32,
pub yarn_beta_fast: f32,
pub yarn_beta_slow: f32,
pub yarn_orig_ctx: i32,
pub defrag_thold: f32,
pub numa: ggml_numa_strategy,
pub split_mode: llama_split_mode,
pub rope_scaling_type: llama_rope_scaling_type,
pub pooling_type: llama_pooling_type,
pub attention_type: llama_attention_type,
pub sparams: common_sampler_params,
}
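A minimal construction sketch in Rust, under stated assumptions: the struct is plain #[repr(C)] data with no constructor shown on this page, so the sketch zero-initializes it and fills a few fields by hand. Zeroing assumes each enum field treats all-zero bytes as a valid "disabled/unspecified" variant, which these docs do not guarantee; prefer a Default impl or helper from the bindings if one exists.

use std::mem;

fn make_params() -> common_params {
    // SAFETY: assumes all-zero bytes form a valid common_params
    // (true for the scalar fields; an assumption for the enum fields).
    let mut p: common_params = unsafe { mem::zeroed() };
    p.n_predict = 128;    // new tokens to predict
    p.n_ctx = 4096;       // context size
    p.n_batch = 2048;     // logical batch size for prompt processing
    p.n_ubatch = 512;     // physical batch size for prompt processing
    p.n_gpu_layers = -1;  // -1 - use default
    p
}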
Fields
n_predict: i32
new tokens to predict
n_ctx: i32
context size
n_batch: i32
logical batch size for prompt processing (must be >=32 to use BLAS)
n_ubatch: i32
physical batch size for prompt processing (must be >=32 to use BLAS)
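The two batch sizes are related: n_ubatch is the physical micro-batch actually submitted to the backend and cannot usefully exceed n_batch. A small sketch of that relationship; the clamping rule mirrors upstream llama.cpp behavior and is an assumption about these bindings, not documented API:

// Keep the physical batch within the logical batch; both should stay
// >= 32 if BLAS is to be used, per the field docs above.
fn effective_ubatch(n_batch: i32, n_ubatch: i32) -> i32 {
    if n_ubatch <= 0 { n_batch } else { n_ubatch.min(n_batch) }
}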
n_keep: i32
number of tokens to keep from initial prompt
n_draft: i32
number of tokens to draft during speculative decoding
n_chunks: i32
max number of chunks to process (-1 = unlimited)
n_parallel: i32
number of parallel sequences to decode
n_sequences: i32
number of sequences to decode
p_split: f32
speculative decoding split probability
n_gpu_layers: i32
number of layers to store in VRAM (-1 - use default)
n_gpu_layers_draft: i32
number of layers to store in VRAM for the draft model (-1 - use default)
main_gpu: i32
the GPU that is used for scratch and small tensors
grp_attn_n: i32
group-attention factor
grp_attn_w: i32
group-attention width
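When group attention (self-extend) is enabled (grp_attn_n > 1), upstream llama.cpp's main example requires the width to be a multiple of the factor. A hedged validation sketch; the ga_w % ga_n == 0 rule comes from that example and may not be enforced by these bindings themselves:

fn validate_group_attention(grp_attn_n: i32, grp_attn_w: i32) -> Result<(), String> {
    if grp_attn_n > 1 && grp_attn_w % grp_attn_n != 0 {
        return Err(format!(
            "grp_attn_w ({grp_attn_w}) must be a multiple of grp_attn_n ({grp_attn_n})"
        ));
    }
    Ok(())
}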
n_print: i32
print token count every n tokens (-1 = disabled)
rope_freq_base: f32
RoPE base frequency
rope_freq_scale: f32
RoPE frequency scaling factor
yarn_ext_factor: f32
YaRN extrapolation mix factor
yarn_attn_factor: f32
YaRN magnitude scaling factor
yarn_beta_fast: f32
YaRN low correction dim
yarn_beta_slow: f32
YaRN high correction dim
yarn_orig_ctx: i32
YaRN original context length
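For plain linear RoPE scaling, rope_freq_scale is the reciprocal of the context-stretch factor: running a model trained at 4096 tokens with an 8192-token context uses a scale of 0.5. An illustrative helper; this is the standard linear-scaling relationship, not an API of this crate:

// rope_freq_scale for linear scaling = trained context / target context,
// e.g. linear_rope_freq_scale(4096, 8192) == 0.5.
fn linear_rope_freq_scale(n_ctx_train: u32, n_ctx_target: u32) -> f32 {
    n_ctx_train as f32 / n_ctx_target as f32
}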
defrag_thold: f32
KV cache defragmentation threshold
numa: ggml_numa_strategy
attempt optimizations that help on some NUMA systems
split_mode: llama_split_mode
how split tensors should be distributed across GPUs
rope_scaling_type: llama_rope_scaling_type
RoPE frequency scaling method
pooling_type: llama_pooling_type
pooling type for embeddings
attention_type: llama_attention_type
attention type for embeddings
sparams: common_sampler_params
sampling parameters