From fa949a58241d2f4d4b228b8944ea572963ae0b31 Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Tue, 3 Jun 2025 11:42:02 +0900
Subject: [PATCH 1/3] Add tests

---
 src/segmenter.rs | 46 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/src/segmenter.rs b/src/segmenter.rs
index 2a58084..3f6206d 100644
--- a/src/segmenter.rs
+++ b/src/segmenter.rs
@@ -29,6 +29,7 @@ impl Segmenter {
             (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"),
             (Regex::new(r"[0-90-9]").unwrap(), "N"),
         ];
+
         Segmenter {
             patterns,
             learner: learner.unwrap_or_else(|| AdaBoost::new(0.01, 100, 1)),
@@ -41,14 +42,14 @@ impl Segmenter {
     /// * `ch` - A string slice representing a single character.
     ///
     /// # Returns
-    /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others).
-    pub fn get_type(&self, ch: &str) -> &'static str {
-        for (pattern, s_type) in &self.patterns {
-            if pattern.is_match(ch) {
-                return s_type;
+    /// Returns a string slice representing the type of the character.
+    pub fn get_type(&self, ch: &str) -> &str {
+        for (regex, label) in &self.patterns {
+            if regex.is_match(ch) {
+                return label;
             }
         }
-        "O"
+        "O" // Other
     }
 
     /// Adds a sentence to the segmenter with a custom writer function.
@@ -269,3 +270,36 @@ impl Segmenter {
             .collect()
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::path::PathBuf;
+
+    use super::*;
+
+    #[test]
+    fn test_segmenter() {
+        let model_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+            .join("./resources")
+            .join("RWCP.model");
+
+        let mut learner = AdaBoost::new(0.01, 100, 1);
+        learner.load_model(model_file.as_path()).unwrap();
+
+        let mut segmenter = Segmenter::new(Some(learner));
+        let sentence = "これはテストです。";
+        segmenter.add_sentence(sentence);
+        let result = segmenter.parse(sentence);
+        assert!(!result.is_empty());
+        assert_eq!(result.len(), 5); // Adjust based on expected segmentation
+    }
+
+    #[test]
+    fn test_get_type() {
+        let segmenter = Segmenter::new(None);
+        assert_eq!(segmenter.get_type("あ"), "I"); // Hiragana
+        assert_eq!(segmenter.get_type("漢"), "H"); // Kanji
+        assert_eq!(segmenter.get_type("A"), "A"); // Latin
+        assert_eq!(segmenter.get_type("1"), "N"); // Digit
+    }
+}

From 7eed49d6e3a587f3f4a89e918579e8eb16d41bd8 Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Tue, 3 Jun 2025 11:43:21 +0900
Subject: [PATCH 2/3] Update doc

---
 src/segmenter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/segmenter.rs b/src/segmenter.rs
index 3f6206d..423e3e0 100644
--- a/src/segmenter.rs
+++ b/src/segmenter.rs
@@ -42,7 +42,7 @@ impl Segmenter {
     /// * `ch` - A string slice representing a single character.
     ///
     /// # Returns
-    /// Returns a string slice representing the type of the character.
+    /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others).
     pub fn get_type(&self, ch: &str) -> &str {
         for (regex, label) in &self.patterns {
             if regex.is_match(ch) {

From fde7a04a40cca2256458181cb86be58b2a7d3e6a Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Tue, 3 Jun 2025 11:44:29 +0900
Subject: [PATCH 3/3] Rename

---
 src/segmenter.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/segmenter.rs b/src/segmenter.rs
index 423e3e0..ec613a8 100644
--- a/src/segmenter.rs
+++ b/src/segmenter.rs
@@ -44,8 +44,8 @@ impl Segmenter {
     /// # Returns
     /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others).
     pub fn get_type(&self, ch: &str) -> &str {
-        for (regex, label) in &self.patterns {
-            if regex.is_match(ch) {
+        for (pattern, label) in &self.patterns {
+            if pattern.is_match(ch) {
                 return label;
             }
         }