unic/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![warn(
    bad_style,
    missing_debug_implementations,
    missing_docs,
    unconditional_recursion
)]
#![forbid(unsafe_code)]

//! # UNIC: Unicode and Internationalization Crates for Rust
//!
//! The `unic` super-crate (this crate) is a collection of all UNIC components. It
//! provides easy access to all of their functionality when all or many of them
//! are needed, instead of importing components one by one, and ensures that all
//! imported components are compatible in algorithms and consistent in data.
//!
//! ## Major Components
//!
//! -   [`char`](/unic-char): Unicode Character utilities.
//!
//! -   [`ucd`](/unic-ucd): Unicode Character Database (UAX\#44).
//!
//! -   [`bidi`](/unic-bidi): Unicode Bidirectional Algorithm (UAX\#9).
//!
//! -   [`normal`](/unic-normal): Unicode Normalization Forms (UAX\#15).
//!
//! -   [`segment`](/unic-segment): Unicode Text Segmentation (UAX\#29).
//!
//! -   [`idna`](/unic-idna): Unicode IDNA Compatibility Processing (UTS\#46).
//!
//! -   [`emoji`](/unic-emoji): Unicode Emoji (UTS\#51).
//!
//!
//! ## A Basic Example
//!
//! ```rust
//! use unic::ucd::common::is_alphanumeric;
//! use unic::bidi::BidiInfo;
//! use unic::normal::StrNormalForm;
//! use unic::segment::{GraphemeIndices, Graphemes, WordBoundIndices, WordBounds, Words};
//! use unic::ucd::normal::compose;
//! use unic::ucd::{is_cased, Age, BidiClass, CharAge, CharBidiClass, StrBidiClass, UnicodeVersion};
//!
//! #[cfg_attr(rustfmt, rustfmt_skip)]
//! #[test]
//! fn test_sample() {
//!
//!     // Age
//!
//!     assert_eq!(Age::of('A').unwrap().actual(), UnicodeVersion { major: 1, minor: 1, micro: 0 });
//!     assert_eq!(Age::of('\u{A0000}'), None);
//!     assert_eq!(
//!         Age::of('\u{10FFFF}').unwrap().actual(),
//!         UnicodeVersion { major: 2, minor: 0, micro: 0 }
//!     );
//!
//!     if let Some(age) = '🦊'.age() {
//!         assert_eq!(age.actual().major, 9);
//!         assert_eq!(age.actual().minor, 0);
//!         assert_eq!(age.actual().micro, 0);
//!     }
//!
//!     // Bidi
//!
//!     let text = concat![
//!         "א",
//!         "ב",
//!         "ג",
//!         "a",
//!         "b",
//!         "c",
//!     ];
//!
//!     assert!(!text.has_bidi_explicit());
//!     assert!(text.has_rtl());
//!     assert!(text.has_ltr());
//!
//!     assert_eq!(text.chars().nth(0).unwrap().bidi_class(), BidiClass::RightToLeft);
//!     assert!(!text.chars().nth(0).unwrap().is_ltr());
//!     assert!(text.chars().nth(0).unwrap().is_rtl());
//!
//!     assert_eq!(text.chars().nth(3).unwrap().bidi_class(), BidiClass::LeftToRight);
//!     assert!(text.chars().nth(3).unwrap().is_ltr());
//!     assert!(!text.chars().nth(3).unwrap().is_rtl());
//!
//!     let bidi_info = BidiInfo::new(text, None);
//!     assert_eq!(bidi_info.paragraphs.len(), 1);
//!
//!     let para = &bidi_info.paragraphs[0];
//!     assert_eq!(para.level.number(), 1);
//!     assert_eq!(para.level.is_rtl(), true);
//!
//!     let line = para.range.clone();
//!     let display = bidi_info.reorder_line(para, line);
//!     assert_eq!(
//!         display,
//!         concat![
//!             "a",
//!             "b",
//!             "c",
//!             "ג",
//!             "ב",
//!             "א",
//!         ]
//!     );
//!
//!     // Case
//!
//!     assert_eq!(is_cased('A'), true);
//!     assert_eq!(is_cased('א'), false);
//!
//!     // Normalization
//!
//!     assert_eq!(compose('A', '\u{030A}'), Some('Å'));
//!
//!     let s = "ÅΩ";
//!     let c = s.nfc().collect::<String>();
//!     assert_eq!(c, "ÅΩ");
//!
//!     // Segmentation
//!
//!     assert_eq!(
//!         Graphemes::new("a\u{310}e\u{301}o\u{308}\u{332}").collect::<Vec<&str>>(),
//!         &["a\u{310}", "e\u{301}", "o\u{308}\u{332}"]
//!     );
//!
//!     assert_eq!(
//!         Graphemes::new("a\r\nb🇺🇳🇮🇨").collect::<Vec<&str>>(),
//!         &["a", "\r\n", "b", "🇺🇳", "🇮🇨"]
//!     );
//!
//!     assert_eq!(
//!         GraphemeIndices::new("a̐éö̲\r\n").collect::<Vec<(usize, &str)>>(),
//!         &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]
//!     );
//!
//!     assert_eq!(
//!         Words::new(
//!             "The quick (\"brown\") fox can't jump 32.3 feet, right?",
//!             |s: &&str| s.chars().any(is_alphanumeric),
//!         ).collect::<Vec<&str>>(),
//!         &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"]
//!     );
//!
//!     assert_eq!(
//!         WordBounds::new("The quick (\"brown\")  fox").collect::<Vec<&str>>(),
//!         &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"]
//!     );
//!
//!     assert_eq!(
//!         WordBoundIndices::new("Brr, it's 29.3°F!").collect::<Vec<(usize, &str)>>(),
//!         &[
//!             (0, "Brr"),
//!             (3, ","),
//!             (4, " "),
//!             (5, "it's"),
//!             (9, " "),
//!             (10, "29.3"),
//!             (14, "°"),
//!             (16, "F"),
//!             (17, "!")
//!         ]
//!     );
//! }
//! ```

pub use unic_bidi as bidi;
pub use unic_char as char;
pub use unic_emoji as emoji;
pub use unic_idna as idna;
pub use unic_normal as normal;
pub use unic_segment as segment;
pub use unic_ucd as ucd;

/// The [Unicode version](https://www.unicode.org/versions/) of data
pub use crate::ucd::UNICODE_VERSION;

mod pkg_info;
pub use crate::pkg_info::{PKG_DESCRIPTION, PKG_NAME, PKG_VERSION};