这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 19 additions & 29 deletions src/segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,26 @@ impl Segmenter {
/// A new Segmenter instance with the specified or default AdaBoost learner.
pub fn new(learner: Option<AdaBoost>) -> Self {
let patterns = vec![
// Numbers
(Regex::new(r"[0-9０-９]").unwrap(), "N"),
// Japanese Kanji numbers
(Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"),
// Japanese Kanji
(Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"),
// Japanese Hiragana
// Hiragana (Japanese)
(Regex::new(r"[ぁ-ん]").unwrap(), "I"),
// Japanese Katakana
(Regex::new(r"[ァ-ヴーア-ン゙ー]").unwrap(), "K"),
// Latin alphabet (ASCII + full-width)
// Katakana (Japanese)
(Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"),
// Hangul (Korean)
(Regex::new(r"[가-힣]").unwrap(), "G"),
// Thai script
(Regex::new(r"[ก-๛]").unwrap(), "T"),
// Kanji (Japanese)
(Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"),
// Kanji (CJK Unified Ideographs)
(Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "Z"),
// Extended Latin (Vietnamese, etc.)
(Regex::new(r"[À-ÿĀ-ſƀ-ƿǍ-ɏ]").unwrap(), "E"),
// ASCII + Full-width Latin
(Regex::new(r"[a-zA-Zａ-ｚＡ-Ｚ]").unwrap(), "A"),
// Numbers (ASCII + full-width)
(Regex::new(r"[0-9０-９]").unwrap(), "N"),
// // Japanese Kanji numbers
// (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"),
// // Japanese Kanji
// (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "J"),
// // Chinese Kanji (CJK Unified Ideographs)
// (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "M"),
// // Hangul (Korean)
// (Regex::new(r"[가-힣]").unwrap(), "K"),
// // Hiragana (Japanese)
// (Regex::new(r"[ぁ-ん]").unwrap(), "I"),
// // Katakana (Japanese)
// (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"),
// // Latin alphabet (ASCII + full-width)
// (Regex::new(r"[a-zA-Zａ-ｚＡ-Ｚ]").unwrap(), "A"),
// // Numbers (ASCII + full-width)
// (Regex::new(r"[0-9０-９]").unwrap(), "N"),
// // Vietnamese Extended Latin
// (Regex::new(r"[À-ſ]").unwrap(), "V"),
// // Thai script
// (Regex::new(r"[ก-๛]").unwrap(), "T"),
];

Segmenter {
Expand All @@ -65,7 +53,9 @@ impl Segmenter {
/// * `ch` - A string slice representing a single character.
///
/// # Returns
/// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others).
/// A string slice representing the type of the character, such as "N" for number,
/// "I" for Hiragana, "K" for Katakana, etc. If the character does not match any pattern,
/// it returns "O" for Other.
pub fn get_type(&self, ch: &str) -> &str {
for (pattern, label) in &self.patterns {
if pattern.is_match(ch) {
Expand Down