1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
use std::{
    convert::TryInto,
    sync::{Arc, Mutex},
};

use crate::{Client, SpanLocation};

/// The API label associated with the given gpu context. The list here only includes
/// APIs that are currently supported by Tracy's own gpu implementations.
//
// Copied from `tracy-client-sys/tracy/common/TracyQueue.hpp:391`. Comment on enum states
// that the values are stable, due to potential serialization issues, so copying this enum
// shouldn't be a problem.
#[repr(u8)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum GpuContextType {
    /// Stand in for other types of contexts.
    Invalid = 0,
    /// An OpenGL context
    OpenGL = 1,
    /// A Vulkan context
    Vulkan = 2,
    /// An OpenCL context
    OpenCL = 3,
    /// A D3D12 context.
    Direct3D12 = 4,
    /// A D3D11 context.
    Direct3D11 = 5,
}

/// Context for creating gpu spans.
///
/// Generally corresponds to a single hardware queue.
///
/// The flow of creating and using gpu context generally looks like this:
///
/// ```rust,no_run
/// # let client = tracy_client::Client::start();
/// // The period of the gpu clock in nanoseconds, as provided by your GPU api.
/// // This value corresponds to 1GHz.
/// let period: f32 = 1_000_000_000.0;
///
/// // GPU API: Record writing a timestamp and resolve that to a mappable buffer.
/// // GPU API: Submit the command buffer writing the timestamp.
/// // GPU API: Immediately block until the submission is finished.
/// // GPU API: Map buffer, get timestamp value.
/// let starting_timestamp: i64 = /* whatever you get from this timestamp */ 0;
///
/// // Create the gpu context
/// let gpu_context = client.new_gpu_context(
///     Some("MyContext"),
///     tracy_client::GpuContextType::Vulkan,
///     starting_timestamp,
///     period
/// ).unwrap();
///
/// // Now you have some work that you want to time on the gpu.
///
/// // GPU API: Record writing a timestamp before the work.
/// let mut span = gpu_context.span_alloc("MyGpuSpan1", "My::Work", "myfile.rs", 12).unwrap();
///
/// // GPU API: Record work.
///
/// // GPU API: Record writing a timestamp after the work.
/// span.end_zone();
///
/// // Some time later, once the written timestamp values are available on the cpu.
/// # let (starting_timestamp, ending_timestamp) = (0, 0);
///
/// // Consumes span.
/// span.upload_timestamp(starting_timestamp, ending_timestamp);
/// ```
#[derive(Clone)]
pub struct GpuContext {
    /// Handle to the client; never read (underscore-prefixed), it only ties
    /// the client's lifetime to this context.
    #[cfg(feature = "enable")]
    _client: Client,
    /// The tracy context id emitted with every gpu event from this context.
    #[cfg(feature = "enable")]
    value: u8,
    /// Gpu timestamp captured at context creation; used as a placeholder
    /// timestamp when a span is dropped without its values being uploaded.
    #[cfg(feature = "enable")]
    gpu_start_timestamp: i64,
    /// Pool of query ids not currently held by a pending span. Spans take two
    /// ids each and return them when their timestamps are uploaded.
    #[cfg(feature = "enable")]
    span_freelist: Arc<Mutex<Vec<u16>>>,
    _private: (),
}
/// Next gpu context id to hand out. A `Mutex` (rather than an atomic) lets
/// `new_gpu_context` check the 255-context limit and claim an id in a single
/// critical section.
#[cfg(feature = "enable")]
static GPU_CONTEXT_INDEX: Mutex<u8> = Mutex::new(0);

/// Errors that can occur when creating a gpu context.
#[derive(Debug)]
pub enum GpuContextCreationError {
    /// More than `u8::MAX` contexts have been created at any point in the program.
    ///
    /// Context ids are never reused, so this is a limit on the total number of
    /// contexts created over the program's lifetime, not on how many are alive
    /// at once.
    TooManyContextsCreated,
}

impl std::fmt::Display for GpuContextCreationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every current variant maps to the same user-facing message.
        f.write_str(
            "More than 255 contexts have been created at any point in the execution of this program.",
        )
    }
}

impl std::error::Error for GpuContextCreationError {}

/// Lifecycle state of a gpu span: `Started` -> `Ended` -> `Uploaded`.
#[derive(Debug, PartialEq)]
enum GpuSpanState {
    /// The span has been started. All gpu spans start in this state.
    Started,
    /// The span has been ended, waiting for timestamp upload.
    Ended,
    /// All timestamps have been uploaded.
    Uploaded,
}

/// Span for timing gpu work.
///
/// See the [context level documentation](GpuContext) for more information on use.
///
/// If the span is dropped early, the following happens:
/// - If the span has not been ended, the span is ended. AND
/// - If the span has not had values uploaded, the span is uploaded with
///   the timestamps marking the start of the current gpu context. This
///   will put the span out of the way of other spans.
#[must_use]
pub struct GpuSpan {
    /// The context this span came from; supplies the context id for event
    /// emission and the freelist the query ids are returned to.
    #[cfg(feature = "enable")]
    context: GpuContext,
    /// Query id paired with the span's starting timestamp.
    #[cfg(feature = "enable")]
    start_query_id: u16,
    /// Query id paired with the span's ending timestamp.
    #[cfg(feature = "enable")]
    end_query_id: u16,
    /// Where the span is in its `Started` -> `Ended` -> `Uploaded` lifecycle.
    #[cfg(feature = "enable")]
    state: GpuSpanState,
    _private: (),
}

/// Errors that can occur when creating a gpu span.
#[derive(Debug)]
pub enum GpuSpanCreationError {
    /// More than `32767` spans are still waiting for gpu data.
    ///
    /// Each span takes two query ids from a 65536-entry freelist and returns
    /// them when its timestamps are uploaded, which bounds how many spans may
    /// be pending at once.
    TooManyPendingSpans,
}

impl std::fmt::Display for GpuSpanCreationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every current variant maps to the same user-facing message.
        f.write_str(
            "Too many spans still waiting for gpu data. There may not be more than 32767 spans that are pending gpu data at once.",
        )
    }
}

impl std::error::Error for GpuSpanCreationError {}

impl Client {
    /// Creates a new GPU context.
    ///
    /// - `name` is the name of the context.
    /// - `ty` is the type (backend) of the context.
    /// - `gpu_timestamp` is the gpu side timestamp that corresponds (as close as possible) to this call.
    /// - `period` is the period of the gpu clock in nanoseconds (setting 1.0 means the clock is 1GHz, 1000.0 means 1MHz, etc).
    ///
    /// See the [type level documentation](GpuContext) for more information.
    ///
    /// # Errors
    ///
    /// - If more than 255 contexts were made during the lifetime of the application.
    pub fn new_gpu_context(
        self,
        name: Option<&str>,
        ty: GpuContextType,
        gpu_timestamp: i64,
        period: f32,
    ) -> Result<GpuContext, GpuContextCreationError> {
        #[cfg(feature = "enable")]
        {
            // Hold the mutex while we read and bump the counter so that two
            // concurrent calls can never observe — and then emit — the same
            // context id.
            let mut context_index_guard = GPU_CONTEXT_INDEX.lock().unwrap();
            if *context_index_guard == 255 {
                return Err(GpuContextCreationError::TooManyContextsCreated);
            }
            let context = *context_index_guard;
            *context_index_guard += 1;
            // The id is claimed; release the lock before making FFI calls.
            drop(context_index_guard);

            // SAFETY:
            // - We know we aren't re-using the context id because of the above logic.
            unsafe {
                sys::___tracy_emit_gpu_new_context_serial(sys::___tracy_gpu_new_context_data {
                    gpuTime: gpu_timestamp,
                    period,
                    context,
                    flags: 0,
                    type_: ty as u8,
                });
            };

            if let Some(name) = name {
                // SAFETY:
                // - We've allocated a context.
                // - The names will copied into the command stream, so the pointers do not need to last.
                unsafe {
                    sys::___tracy_emit_gpu_context_name_serial(
                        sys::___tracy_gpu_context_name_data {
                            context,
                            name: name.as_ptr().cast(),
                            // Name lengths beyond u16::MAX are truncated rather than rejected.
                            len: name.len().try_into().unwrap_or(u16::MAX),
                        },
                    );
                }
            }

            Ok(GpuContext {
                _client: self,
                value: context,
                gpu_start_timestamp: gpu_timestamp,
                // Every possible u16 query id starts out available.
                span_freelist: Arc::new(Mutex::new((0..=u16::MAX).collect())),
                _private: (),
            })
        }
        #[cfg(not(feature = "enable"))]
        Ok(GpuContext { _private: () })
    }
}

impl GpuContext {
    /// Pops a pair of (start, end) query ids off this context's freelist.
    ///
    /// # Errors
    ///
    /// - If fewer than two ids remain, i.e. too many spans are still pending
    ///   their gpu data.
    #[cfg(feature = "enable")]
    fn alloc_span_ids(&self) -> Result<(u16, u16), GpuSpanCreationError> {
        let mut freelist = self.span_freelist.lock().unwrap();
        if freelist.len() < 2 {
            return Err(GpuSpanCreationError::TooManyPendingSpans);
        }
        // These unwraps are unreachable: we just checked at least two ids remain.
        let start = freelist.pop().unwrap();
        let end = freelist.pop().unwrap();
        Ok((start, end))
    }

    /// Creates a new gpu span with the given source location.
    ///
    /// This should be called right next to where you record the corresponding gpu timestamp. This
    /// allows tracy to correctly associate the cpu time with the gpu timestamp.
    ///
    /// # Errors
    ///
    /// - If there are more than 32767 spans waiting for gpu data at once.
    pub fn span(
        &self,
        span_location: &'static SpanLocation,
    ) -> Result<GpuSpan, GpuSpanCreationError> {
        #[cfg(feature = "enable")]
        {
            let (start_query_id, end_query_id) = self.alloc_span_ids()?;

            // SAFETY: We know that the span location is valid forever as it is 'static. `usize` will
            // always be smaller than u64, so no data will be lost.
            unsafe {
                sys::___tracy_emit_gpu_zone_begin_serial(sys::___tracy_gpu_zone_begin_data {
                    srcloc: std::ptr::addr_of!(span_location.data) as usize as u64,
                    queryId: start_query_id,
                    context: self.value,
                });
            };

            Ok(GpuSpan {
                context: self.clone(),
                start_query_id,
                end_query_id,
                state: GpuSpanState::Started,
                _private: (),
            })
        }
        #[cfg(not(feature = "enable"))]
        Ok(GpuSpan { _private: () })
    }

    /// Creates a new gpu span with the given name, function, file, and line.
    ///
    /// This should be called right next to where you record the corresponding gpu timestamp. This
    /// allows tracy to correctly associate the cpu time with the gpu timestamp.
    ///
    /// # Errors
    ///
    /// - If there are more than 32767 spans waiting for gpu data at once.
    pub fn span_alloc(
        &self,
        name: &str,
        function: &str,
        file: &str,
        line: u32,
    ) -> Result<GpuSpan, GpuSpanCreationError> {
        #[cfg(feature = "enable")]
        {
            // Reserve the query ids *before* allocating the source location so
            // that an early return on `TooManyPendingSpans` cannot leak a
            // srcloc allocation made inside tracy (it would otherwise never be
            // consumed by a zone-begin event).
            let (start_query_id, end_query_id) = self.alloc_span_ids()?;

            // SAFETY: The string pointers and lengths are valid for the
            // duration of this call, which is all the FFI signature requires.
            let srcloc = unsafe {
                sys::___tracy_alloc_srcloc_name(
                    line,
                    file.as_ptr().cast(),
                    file.len(),
                    function.as_ptr().cast(),
                    function.len(),
                    name.as_ptr().cast(),
                    name.len(),
                    0,
                )
            };

            // SAFETY: `srcloc` was just allocated above and is consumed here.
            unsafe {
                sys::___tracy_emit_gpu_zone_begin_alloc_serial(sys::___tracy_gpu_zone_begin_data {
                    srcloc,
                    queryId: start_query_id,
                    context: self.value,
                });
            };

            Ok(GpuSpan {
                context: self.clone(),
                start_query_id,
                end_query_id,
                state: GpuSpanState::Started,
                _private: (),
            })
        }
        #[cfg(not(feature = "enable"))]
        Ok(GpuSpan { _private: () })
    }
}

impl GpuSpan {
    /// Marks the end of the given gpu span. This should be called right next to where you record
    /// the corresponding gpu timestamp for the end of the span. This allows tracy to correctly
    /// associate the cpu time with the gpu timestamp.
    ///
    /// Only the first time you call this function will it actually emit a gpu zone end event. Any
    /// subsequent calls will be ignored.
    pub fn end_zone(&mut self) {
        #[cfg(feature = "enable")]
        {
            // Idempotence: only a span still in `Started` emits an end event.
            if self.state != GpuSpanState::Started {
                return;
            }
            unsafe {
                sys::___tracy_emit_gpu_zone_end_serial(sys::___tracy_gpu_zone_end_data {
                    queryId: self.end_query_id,
                    context: self.context.value,
                });
            };
            self.state = GpuSpanState::Ended;
        }
    }

    /// Uploads the gpu timestamps associated with the span start and end to tracy,
    /// closing out the span.
    ///
    /// # Panics
    ///
    /// - If [`Self::end_zone`] has not been called on this span first (only
    ///   when the `enable` feature is active).
    pub fn upload_timestamp(mut self, start_timestamp: i64, end_timestamp: i64) {
        #[cfg(feature = "enable")]
        self.upload_timestamp_impl(start_timestamp, end_timestamp);
    }

    /// Emits both gpu timestamps and returns the span's query ids to the
    /// context's freelist. Requires the span to be in the `Ended` state.
    #[cfg(feature = "enable")]
    fn upload_timestamp_impl(&mut self, start_timestamp: i64, end_timestamp: i64) {
        assert_eq!(
            self.state,
            GpuSpanState::Ended,
            "You must call end_zone before uploading timestamps."
        );
        unsafe {
            sys::___tracy_emit_gpu_time_serial(sys::___tracy_gpu_time_data {
                gpuTime: start_timestamp,
                queryId: self.start_query_id,
                context: self.context.value,
            });
        };

        unsafe {
            sys::___tracy_emit_gpu_time_serial(sys::___tracy_gpu_time_data {
                gpuTime: end_timestamp,
                queryId: self.end_query_id,
                context: self.context.value,
            });
        };

        // Put the ids back into the freelist so future spans can reuse them.
        let mut freelist = self.context.span_freelist.lock().unwrap();
        freelist.push(self.start_query_id);
        freelist.push(self.end_query_id);
        drop(freelist);

        self.state = GpuSpanState::Uploaded;
    }
}

impl Drop for GpuSpan {
    /// Cleans up a span dropped before it was fully uploaded: ends the zone if
    /// necessary, then uploads placeholder timestamps so the span stays out of
    /// the way of other spans and its query ids are recycled.
    fn drop(&mut self) {
        #[cfg(feature = "enable")]
        {
            // A span that was never ended gets its zone-end emitted now, which
            // transitions it from `Started` to `Ended`.
            if self.state == GpuSpanState::Started {
                self.end_zone();
            }
            // Anything still awaiting upload is closed out with a zero-length
            // placeholder interval at the context's starting timestamp.
            if self.state == GpuSpanState::Ended {
                let placeholder = self.context.gpu_start_timestamp;
                self.upload_timestamp_impl(placeholder, placeholder);
            }
        }
    }
}