From 3a18e4379e68ba62414900ded6ed0f89bb0f978e Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Tue, 3 Jun 2025 21:53:51 +0900 Subject: [PATCH 1/2] Support multi languages --- src/segmenter.rs | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/src/segmenter.rs b/src/segmenter.rs index e479fb6..1d217f3 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -19,38 +19,26 @@ impl Segmenter { /// A new Segmenter instance with the specified or default AdaBoost learner. pub fn new(learner: Option) -> Self { let patterns = vec![ + // Numbers + (Regex::new(r"[0-90-9]").unwrap(), "N"), // Japanese Kanji numbers (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), - // Japanese Kanji - (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), - // Japanese Hiragana + // Hiragana (Japanese) (Regex::new(r"[ぁ-ん]").unwrap(), "I"), - // Japanese Katakana - (Regex::new(r"[ァ-ヴーア-ン゙ー]").unwrap(), "K"), - // Latin alphabet (ASCII + full-width) + // Katakana (Japanese) + (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), + // Hangul (Korean) + (Regex::new(r"[가-힣]").unwrap(), "G"), + // Thai script + (Regex::new(r"[ก-๛]").unwrap(), "T"), + // Kanji (Japanese) + (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), + // Kanji (CJK Unified Ideographs) + (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "Z"), + // Extended Latin (Vietnamese, etc.) + (Regex::new(r"[À-ÿĀ-ſƀ-ƿǍ-ɏ]").unwrap(), "E"), + // ASCII + Full-width Latin (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), - // Numbers (ASCII + full-width) - (Regex::new(r"[0-90-9]").unwrap(), "N"), - // // Japanese Kanji numbers - // (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), - // // Japanese Kanji - // (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "J"), - // // Chinese Kanji (CJK Unified Ideographs) - // (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "M"), - // // Hangul (Korean) - // (Regex::new(r"[가-힣]").unwrap(), "K"), - // // Hiragana (Japanese) - // (Regex::new(r"[ぁ-ん]").unwrap(), "I"), - // // Katakana (Japanese) - // (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), - // // Latin alphabet (ASCII + full-width) - // (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), - // // Numbers (ASCII + full-width) - // (Regex::new(r"[0-90-9]").unwrap(), "N"), - // // Vietnamese Extended Latin - // (Regex::new(r"[À-ſ]").unwrap(), "V"), - // // Thai script - // (Regex::new(r"[ก-๛]").unwrap(), "T"), ]; Segmenter { From 2a114204319ef24701403bf8e83262a99b13ae4f Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Tue, 3 Jun 2025 22:01:05 +0900 Subject: [PATCH 2/2] Fix document --- src/segmenter.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/segmenter.rs b/src/segmenter.rs index 1d217f3..cf0e117 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -53,7 +53,9 @@ impl Segmenter { /// * `ch` - A string slice representing a single character. /// /// # Returns - /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others). + /// A string slice representing the type of the character, such as "N" for number, + /// "I" for Hiragana, "K" for Katakana, etc. If the character does not match any pattern, + /// it returns "O" for Other. pub fn get_type(&self, ch: &str) -> &str { for (pattern, label) in &self.patterns { if pattern.is_match(ch) {