llama_cpp_2/timing.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
//! Safe wrapper around `llama_cpp_sys_2::llama_perf_context_data` timing data.
use std::fmt::{Debug, Display, Formatter};
/// Timing measurements backed by a raw `llama_cpp_sys_2::llama_perf_context_data`.
///
/// The raw value is only reachable from inside this crate; everything else
/// reads and writes the individual measurements through the methods on this
/// type. The wrapper is `Copy`, so passing it by value is cheap.
#[derive(Debug, Clone, Copy)]
pub struct LlamaTimings {
    /// The wrapped raw timing record.
    pub(crate) timings: llama_cpp_sys_2::llama_perf_context_data,
}
impl LlamaTimings {
    /// Build a `LlamaTimings` from its six individual measurements.
    ///
    /// The arguments are stored unchanged in the wrapped
    /// `llama_perf_context_data`:
    ///
    /// * `t_start_ms` - start time in milliseconds
    /// * `t_load_ms` - load time in milliseconds
    /// * `t_p_eval_ms` - prompt evaluation time in milliseconds
    /// * `t_eval_ms` - evaluation time in milliseconds
    /// * `n_p_eval` - number of prompt evaluations
    /// * `n_eval` - number of evaluations
    ///
    /// # Examples
    ///
    /// ```
    /// # use llama_cpp_2::timing::LlamaTimings;
    /// let timings = LlamaTimings::new(1.0, 2.0, 3.0, 4.0, 5, 6);
    /// let timings_str = "load time = 2.00 ms
    /// prompt eval time = 3.00 ms / 5 tokens (0.60 ms per token, 1666.67 tokens per second)
    /// eval time = 4.00 ms / 6 runs (0.67 ms per token, 1500.00 tokens per second)\n";
    /// assert_eq!(timings_str, format!("{}", timings));
    /// ```
    // Each argument maps one-to-one onto a field of the wrapped struct.
    #[allow(clippy::too_many_arguments)]
    #[must_use]
    pub fn new(
        t_start_ms: f64,
        t_load_ms: f64,
        t_p_eval_ms: f64,
        t_eval_ms: f64,
        n_p_eval: i32,
        n_eval: i32,
    ) -> Self {
        let raw = llama_cpp_sys_2::llama_perf_context_data {
            t_start_ms,
            t_load_ms,
            t_p_eval_ms,
            t_eval_ms,
            n_p_eval,
            n_eval,
        };
        Self { timings: raw }
    }

    /// Start time, in milliseconds.
    #[must_use]
    pub fn t_start_ms(&self) -> f64 {
        self.timings.t_start_ms
    }

    /// Load time, in milliseconds.
    #[must_use]
    pub fn t_load_ms(&self) -> f64 {
        self.timings.t_load_ms
    }

    /// Prompt evaluation time, in milliseconds.
    #[must_use]
    pub fn t_p_eval_ms(&self) -> f64 {
        self.timings.t_p_eval_ms
    }

    /// Evaluation time, in milliseconds.
    #[must_use]
    pub fn t_eval_ms(&self) -> f64 {
        self.timings.t_eval_ms
    }

    /// Number of prompt evaluations.
    #[must_use]
    pub fn n_p_eval(&self) -> i32 {
        self.timings.n_p_eval
    }

    /// Number of evaluations.
    #[must_use]
    pub fn n_eval(&self) -> i32 {
        self.timings.n_eval
    }

    /// Overwrite the start time (milliseconds).
    pub fn set_t_start_ms(&mut self, t_start_ms: f64) {
        self.timings.t_start_ms = t_start_ms;
    }

    /// Overwrite the load time (milliseconds).
    pub fn set_t_load_ms(&mut self, t_load_ms: f64) {
        self.timings.t_load_ms = t_load_ms;
    }

    /// Overwrite the prompt evaluation time (milliseconds).
    pub fn set_t_p_eval_ms(&mut self, t_p_eval_ms: f64) {
        self.timings.t_p_eval_ms = t_p_eval_ms;
    }

    /// Overwrite the evaluation time (milliseconds).
    pub fn set_t_eval_ms(&mut self, t_eval_ms: f64) {
        self.timings.t_eval_ms = t_eval_ms;
    }

    /// Overwrite the number of prompt evaluations.
    pub fn set_n_p_eval(&mut self, n_p_eval: i32) {
        self.timings.n_p_eval = n_p_eval;
    }

    /// Overwrite the number of evaluations.
    pub fn set_n_eval(&mut self, n_eval: i32) {
        self.timings.n_eval = n_eval;
    }
}
impl Display for LlamaTimings {
    /// Write a three-line, newline-terminated summary of the timings.
    ///
    /// The lines report, in order, the load time, the prompt evaluation time
    /// and the evaluation time. The two evaluation lines also show the count
    /// and two derived figures: milliseconds per token (`time / count`) and
    /// tokens per second (`1e3 / time * count`). The start time is not printed.
    ///
    /// The derived figures are plain `f64` arithmetic with no special-casing,
    /// so a zero count or a zero time is rendered as `inf` or `NaN`.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let load_ms = self.t_load_ms();
        let (prompt_ms, prompt_tokens) = (self.t_p_eval_ms(), self.n_p_eval());
        let (eval_ms, eval_runs) = (self.t_eval_ms(), self.n_eval());

        writeln!(f, "load time = {:.2} ms", load_ms)?;
        writeln!(
            f,
            "prompt eval time = {:.2} ms / {} tokens ({:.2} ms per token, {:.2} tokens per second)",
            prompt_ms,
            prompt_tokens,
            prompt_ms / f64::from(prompt_tokens),
            1e3 / prompt_ms * f64::from(prompt_tokens)
        )?;
        // The last write's result is the result of the whole call.
        writeln!(
            f,
            "eval time = {:.2} ms / {} runs ({:.2} ms per token, {:.2} tokens per second)",
            eval_ms,
            eval_runs,
            eval_ms / f64::from(eval_runs),
            1e3 / eval_ms * f64::from(eval_runs)
        )
    }
}