use std::mem::take;
use std::str::from_utf8_unchecked;
use crate::TokenSource;
/// Returns a [`TokenSource`] that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
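///
/// A minimal usage sketch showing that the terminators are stripped
/// (illustrative only; the `imara_diff::sources` path is an assumption):
/// ```
/// use imara_diff::sources::lines;
///
/// let mut tokens = lines("foo\r\nbar\nbaz");
/// assert_eq!(tokens.next(), Some("foo"));
/// assert_eq!(tokens.next(), Some("bar"));
/// assert_eq!(tokens.next(), Some("baz"));
/// assert_eq!(tokens.next(), None);
/// ```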
pub fn lines(data: &str) -> Lines<'_, false> {
    Lines(ByteLines(data.as_bytes()))
}
/// Returns a [`TokenSource`] that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
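///
/// A minimal usage sketch showing that the terminators are kept
/// (illustrative only; the `imara_diff::sources` path is an assumption):
/// ```
/// use imara_diff::sources::lines_with_terminator;
///
/// let mut tokens = lines_with_terminator("foo\r\nbar\nbaz");
/// assert_eq!(tokens.next(), Some("foo\r\n"));
/// assert_eq!(tokens.next(), Some("bar\n"));
/// // The last line has no terminator and is emitted as-is.
/// assert_eq!(tokens.next(), Some("baz"));
/// assert_eq!(tokens.next(), None);
/// ```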
pub fn lines_with_terminator(data: &str) -> Lines<'_, true> {
    Lines(ByteLines(data.as_bytes()))
}
/// Returns a [`TokenSource`] that uses
/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is
/// included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
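///
/// A minimal usage sketch showing that the terminators are kept
/// (illustrative only; the `imara_diff::sources` path is an assumption):
/// ```
/// use imara_diff::sources::byte_lines_with_terminator;
///
/// let mut tokens = byte_lines_with_terminator(b"foo\nbar");
/// assert_eq!(tokens.next(), Some(&b"foo\n"[..]));
/// assert_eq!(tokens.next(), Some(&b"bar"[..]));
/// assert_eq!(tokens.next(), None);
/// ```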
pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> {
    ByteLines(data)
}
/// Returns a [`TokenSource`] that uses
/// the lines in `data` as Tokens. A line is a continuous subslice of
/// `data` which does not contain `\n` (or `\r\n`).
/// The newline separator (`\r\n` or `\n`) is not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
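///
/// A minimal usage sketch showing that the terminators are stripped
/// (illustrative only; the `imara_diff::sources` path is an assumption):
/// ```
/// use imara_diff::sources::byte_lines;
///
/// let mut tokens = byte_lines(b"foo\r\nbar\n");
/// assert_eq!(tokens.next(), Some(&b"foo"[..]));
/// assert_eq!(tokens.next(), Some(&b"bar"[..]));
/// assert_eq!(tokens.next(), None);
/// ```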
pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> {
    ByteLines(data)
}
/// By default a line diff is produced for a string
impl<'a> TokenSource for &'a str {
    type Token = &'a str;
    type Tokenizer = Lines<'a, false>;
    fn tokenize(&self) -> Self::Tokenizer {
        lines(self)
    }
    fn estimate_tokens(&self) -> u32 {
        lines_with_terminator(self).estimate_tokens()
    }
}
/// By default a line diff is produced for a byte slice
impl<'a> TokenSource for &'a [u8] {
    type Token = Self;
    type Tokenizer = ByteLines<'a, false>;
    fn tokenize(&self) -> Self::Tokenizer {
        byte_lines(self)
    }
    fn estimate_tokens(&self) -> u32 {
        byte_lines(self).estimate_tokens()
    }
}
/// A [`TokenSource`] that returns the lines of a `str` as tokens.
/// See [`lines`] and [`lines_with_terminator`] for details
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>);
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a str;
    fn next(&mut self) -> Option<Self::Item> {
        // Safety invariant: this struct may only contain valid UTF-8.
        // Splitting valid UTF-8 at ASCII newline boundaries always yields valid UTF-8.
        self.0.next().map(|it| unsafe { from_utf8_unchecked(it) })
    }
}
/// A [`Lines`] tokenizer acts as its own [`TokenSource`]
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> {
    type Token = &'a str;
    type Tokenizer = Self;
    fn tokenize(&self) -> Self::Tokenizer {
        *self
    }
    fn estimate_tokens(&self) -> u32 {
        self.0.estimate_tokens()
    }
}
/// A [`TokenSource`] that returns the lines of a byte slice as tokens.
/// See [`byte_lines`] and [`byte_lines_with_terminator`] for details
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]);
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a [u8];
    fn next(&mut self) -> Option<Self::Item> {
        let mut saw_carriage_return = false;
        let mut iter = self.0.iter().enumerate();
        // Scan for the next `\n`; `line_len` is the index one past it.
        let line_len = loop {
            match iter.next() {
                Some((i, b'\n')) => break i + 1,
                None => {
                    // No trailing newline left: emit the remaining bytes as the last line.
                    return (!self.0.is_empty()).then(|| take(&mut self.0));
                }
                Some((_, &it)) => saw_carriage_return = it == b'\r',
            }
        };
        let (mut line, rem) = self.0.split_at(line_len);
        self.0 = rem;
        if !INCLUDE_LINE_TERMINATOR {
            // Strip the trailing `\n`, and the `\r` directly before it if present.
            line = &line[..line_len - 1 - saw_carriage_return as usize];
        }
        Some(line)
    }
}
/// A [`ByteLines`] tokenizer acts as its own [`TokenSource`]
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource
    for ByteLines<'a, INCLUDE_LINE_TERMINATOR>
{
    type Token = &'a [u8];
    type Tokenizer = Self;
    fn tokenize(&self) -> Self::Tokenizer {
        *self
    }
    fn estimate_tokens(&self) -> u32 {
        // Sample the first 20 lines (over a copy, since `ByteLines` is `Copy`)
        // and extrapolate the average line length to the whole input.
        let len: usize = self.take(20).map(|line| line.len()).sum();
        if len == 0 {
            // Nothing to sample (empty or blank input): fall back to a fixed guess.
            100
        } else {
            (self.0.len() * 20 / len) as u32
        }
    }
}
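
// A small illustrative test module (an addition, not part of the upstream file):
// it exercises the line tokenizers above and the blanket `TokenSource` impls
// for `&str` and `&[u8]`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::TokenSource;

    #[test]
    fn str_tokenizes_into_lines_without_terminators() {
        let tokens: Vec<_> = "foo\r\nbar\nbaz".tokenize().collect();
        assert_eq!(tokens, ["foo", "bar", "baz"]);
    }

    #[test]
    fn terminator_variant_keeps_line_endings() {
        let tokens: Vec<_> = lines_with_terminator("foo\r\nbar\n").collect();
        assert_eq!(tokens, ["foo\r\n", "bar\n"]);
    }

    #[test]
    fn byte_slice_tokenizes_into_lines_without_terminators() {
        let tokens: Vec<_> = b"foo\nbar\n".as_slice().tokenize().collect();
        assert_eq!(tokens, [&b"foo"[..], &b"bar"[..]]);
    }
}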