imara_diff/
sources.rs

1use std::mem::take;
2use std::str::from_utf8_unchecked;
3
4use crate::TokenSource;
5
6/// Returns a [`TokenSource`] that uses
7/// the lines in `data` as Tokens. The newline seperator (`\r\n` or `\n`) is
8/// not included in the emitted tokens.
9/// This means that changing the newline seperator from `\r\n` to `\n`
10/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
11pub fn lines(data: &str) -> Lines<'_, false> {
12    Lines(ByteLines(data.as_bytes()))
13}
14
15/// Returns a [`TokenSource`] that uses
16/// the lines in `data` as Tokens. The newline seperator (`\r\n` or `\n`) is
17/// included in the emitted tokens.
18/// This means that changing the newline seperator from `\r\n` to `\n`
19/// (or omitting it fully on the last line) is  detected by [`diff`](crate::diff).
20pub fn lines_with_terminator(data: &str) -> Lines<'_, true> {
21    Lines(ByteLines(data.as_bytes()))
22}
23
24/// Returns a [`TokenSource`] that uses
25/// the lines in `data` as Tokens. A lines is a continous subslice of
26/// `data` which does not contain `\n` (or `\r\n`).
27/// The newline seperator (`\r\n` or `\n`) is not included in the emitted tokens.
28/// This means that changing the newline seperator from `\r\n` to `\n`
29/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
30pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> {
31    ByteLines(data)
32}
33
34/// Returns a [`TokenSource`] that uses
35/// the lines in `data` as Tokens. The newline seperator (`\r\n` or `\n`) is
36/// included in the emitted tokens.
37/// This means that changing the newline seperator from `\r\n` to `\n`
38/// (or omitting it fully on the last line) is  detected by [`diff`](crate::diff).
39pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> {
40    ByteLines(data)
41}
42
43/// By default, a line diff is produced for a string
44impl<'a> TokenSource for &'a str {
45    type Token = &'a str;
46
47    type Tokenizer = Lines<'a, false>;
48
49    fn tokenize(&self) -> Self::Tokenizer {
50        lines(self)
51    }
52
53    fn estimate_tokens(&self) -> u32 {
54        lines_with_terminator(self).estimate_tokens()
55    }
56}
57
58/// By default, a line diff is produced for a bytes
59impl<'a> TokenSource for &'a [u8] {
60    type Token = Self;
61    type Tokenizer = ByteLines<'a, false>;
62
63    fn tokenize(&self) -> Self::Tokenizer {
64        byte_lines(self)
65    }
66
67    fn estimate_tokens(&self) -> u32 {
68        byte_lines(self).estimate_tokens()
69    }
70}
71
/// A [`TokenSource`] that returns the lines of a `str` as tokens.
/// Whether the newline terminator is part of each token is controlled by
/// the `INCLUDE_LINE_TERMINATOR` const parameter.
/// See [`lines`] and [`lines_with_terminator`] for details.
// Invariant: the wrapped ByteLines must only ever contain valid UTF-8;
// the Iterator impl relies on this for `from_utf8_unchecked`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>);
76
77impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> {
78    type Item = &'a str;
79
80    fn next(&mut self) -> Option<Self::Item> {
81        // safety invariant: this struct may only contain valid utf8
82        // dividing valid utf8 bytes by ascii characters always produces valid utf-8
83        self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
84    }
85}
86
87/// By default a line diff is produced for a string
88impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> {
89    type Token = &'a str;
90
91    type Tokenizer = Self;
92
93    fn tokenize(&self) -> Self::Tokenizer {
94        *self
95    }
96
97    fn estimate_tokens(&self) -> u32 {
98        self.0.estimate_tokens()
99    }
100}
101
/// A [`TokenSource`] that returns the lines of a byte slice as tokens.
/// Whether the newline terminator is part of each token is controlled by
/// the `INCLUDE_LINE_TERMINATOR` const parameter.
/// See [`byte_lines`] and [`byte_lines_with_terminator`] for details.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]);

impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a [u8];

    /// Yields the next line of the remaining input, advancing past it.
    fn next(&mut self) -> Option<Self::Item> {
        match self.0.iter().position(|&byte| byte == b'\n') {
            // No newline left: emit the unterminated final line (if any),
            // leaving an empty slice behind so the iteration then stops.
            None => (!self.0.is_empty()).then(|| take(&mut self.0)),
            Some(newline) => {
                let (line, rest) = self.0.split_at(newline + 1);
                self.0 = rest;
                if INCLUDE_LINE_TERMINATOR {
                    Some(line)
                } else {
                    // Strip the `\n`, and the preceding `\r` for `\r\n`
                    // terminated lines.
                    let terminator_len =
                        if newline > 0 && line[newline - 1] == b'\r' { 2 } else { 1 };
                    Some(&line[..line.len() - terminator_len])
                }
            }
        }
    }
}
130
131/// By default a line diff is produced for a string
132impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource
133    for ByteLines<'a, INCLUDE_LINE_TERMINATOR>
134{
135    type Token = &'a [u8];
136
137    type Tokenizer = Self;
138
139    fn tokenize(&self) -> Self::Tokenizer {
140        *self
141    }
142
143    fn estimate_tokens(&self) -> u32 {
144        let len: usize = self.take(20).map(|line| line.len()).sum();
145        if len == 0 {
146            100
147        } else {
148            (self.0.len() * 20 / len) as u32
149        }
150    }
151}