pub struct TfIdf { /* private fields */ }
Expand description
TF-IDF keywords extraction
Requires the `tfidf` feature to be enabled.
Implementations§
Source§impl TfIdf
Implementation of JiebaKeywordExtract using a TF-IDF dictionary.
impl TfIdf
Implementation of JiebaKeywordExtract using a TF-IDF dictionary.
This takes the segments produced by Jieba and attempts to extract keywords. Segments are filtered for stopwords and short terms. They are then matched against a loaded dictionary to calculate TF-IDF scores.
Sourcepub fn new(
opt_dict: Option<&mut impl BufRead>,
config: KeywordExtractConfig,
) -> Self
pub fn new( opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig, ) -> Self
Creates a TfIdf.
§Examples
New instance with custom idf dictionary.
use jieba_rs::{TfIdf, KeywordExtractConfig};
let mut sample_idf = "劳动防护 13.900677652\n\
生化学 13.900677652\n";
TfIdf::new(
Some(&mut sample_idf.as_bytes()),
KeywordExtractConfig::default());
New instance with module default stop words and no initial IDF
dictionary. Dictionary should be loaded later with load_dict()
calls.
use jieba_rs::{TfIdf, KeywordExtractConfig};
TfIdf::new(
None::<&mut std::io::Empty>,
KeywordExtractConfig::default());
Sourcepub fn load_dict(&mut self, dict: &mut impl BufRead) -> Result<()>
pub fn load_dict(&mut self, dict: &mut impl BufRead) -> Result<()>
Merges entries from `dict` into the `idf_dict`.
use jieba_rs::{Jieba, KeywordExtract, Keyword, KeywordExtractConfig,
TfIdf};
let jieba = Jieba::default();
let mut init_idf = "生化学 13.900677652\n";
let mut tfidf = TfIdf::new(
Some(&mut init_idf.as_bytes()),
KeywordExtractConfig::default());
let top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
assert_eq!(
top_k,
vec![
Keyword { keyword: "不是".to_string(), weight: 4.6335592173333335 },
Keyword { keyword: "光化学".to_string(), weight: 4.6335592173333335 },
Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
]
);
let mut init_idf = "光化学 99.123456789\n";
tfidf.load_dict(&mut init_idf.as_bytes());
let new_top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
assert_eq!(
new_top_k,
vec![
Keyword { keyword: "不是".to_string(), weight: 33.041152263 },
Keyword { keyword: "光化学".to_string(), weight: 33.041152263 },
Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
]
);
pub fn config(&self) -> &KeywordExtractConfig
pub fn config_mut(&mut self) -> &mut KeywordExtractConfig
Trait Implementations§
Source§impl KeywordExtract for TfIdf
impl KeywordExtract for TfIdf
Source§fn extract_keywords(
&self,
jieba: &Jieba,
sentence: &str,
top_k: usize,
allowed_pos: Vec<String>,
) -> Vec<Keyword>
fn extract_keywords( &self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>, ) -> Vec<Keyword>
Uses the TF-IDF algorithm to extract the `top_k` keywords from `sentence`.
If `allowed_pos` is not empty, then only terms matching those parts of speech are considered.
§Examples
use jieba_rs::{Jieba, KeywordExtract, TfIdf};
let jieba = Jieba::new();
let keyword_extractor = TfIdf::default();
let mut top_k = keyword_extractor.extract_keywords(
&jieba,
"今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃",
3,
vec![],
);
assert_eq!(
top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
vec!["北京烤鸭", "纽约", "天气"]
);
top_k = keyword_extractor.extract_keywords(
&jieba,
"此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
5,
vec![],
);
assert_eq!(
top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
vec!["欧亚", "吉林", "置业", "万元", "增资"]
);
top_k = keyword_extractor.extract_keywords(
&jieba,
"此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
5,
vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
);
assert_eq!(
top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
vec!["欧亚", "吉林", "置业", "增资", "实现"]
);
Auto Trait Implementations§
impl Freeze for TfIdf
impl RefUnwindSafe for TfIdf
impl Send for TfIdf
impl Sync for TfIdf
impl Unpin for TfIdf
impl UnwindSafe for TfIdf
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more