From 6d8832c44f629828bffb3fb8436d927133ae8210 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Mon, 2 Jun 2025 23:38:50 +0900 Subject: [PATCH 01/15] Refactoring --- Cargo.toml | 2 +- src/adaboost.rs | 19 ++++---- src/extractor.rs | 67 +++++++++++++++++++++++++++ src/lib.rs | 2 + src/main.rs | 115 +++++++++++++++++------------------------------ src/segmenter.rs | 27 +++++++---- src/trainer.rs | 48 ++++++++++++++++++++ 7 files changed, 186 insertions(+), 94 deletions(-) create mode 100644 src/extractor.rs create mode 100644 src/trainer.rs diff --git a/Cargo.toml b/Cargo.toml index 0652eae..ad4ee53 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ categories = ["text-processing"] license = "MIT" [features] -default = [] # No directories included +default = [] [dependencies] clap = { version = "4.5.39", features = ["derive"] } diff --git a/src/adaboost.rs b/src/adaboost.rs index d2125f1..51b4fa3 100644 --- a/src/adaboost.rs +++ b/src/adaboost.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{BufRead, BufReader, Write}; +use std::path::Path; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; @@ -53,7 +54,7 @@ impl AdaBoost { /// * `filename`: The path to the file containing the features. /// # Returns: A result indicating success or failure. /// # Errors: Returns an error if the file cannot be opened or read. - pub fn initialize_features(&mut self, filename: &str) -> std::io::Result<()> { + pub fn initialize_features(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; let reader = BufReader::new(file); let mut map = BTreeMap::new(); // preserve order @@ -100,7 +101,7 @@ impl AdaBoost { /// * `filename`: The path to the file containing the instances. /// # Returns: A result indicating success or failure. /// # Errors: Returns an error if the file cannot be opened or read. 
- pub fn initialize_instances(&mut self, filename: &str) -> std::io::Result<()> { + pub fn initialize_instances(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; let reader = BufReader::new(file); let bias = self.get_bias(); @@ -173,7 +174,7 @@ impl AdaBoost { // Find the best hypothesis let mut h_best = 0; let mut best_error_rate = positive_weight_sum / instance_weight_sum; - for h in 1..num_features { + for (h, _) in errors.iter().enumerate().take(num_features).skip(1) { let mut e = errors[h] + positive_weight_sum; e /= instance_weight_sum; if (0.5 - e).abs() > (0.5 - best_error_rate).abs() { @@ -232,7 +233,7 @@ impl AdaBoost { /// # Errors: Returns an error if the file cannot be created or written to. /// # Notes: The bias term is calculated as the negative sum of the weights divided by 2. /// The model is saved in a way that can be easily loaded later. - pub fn save_model(&self, filename: &str) -> std::io::Result<()> { + pub fn save_model(&self, filename: &Path) -> std::io::Result<()> { let mut file = File::create(filename)?; let mut bias = -self.model[0]; for (h, &w) in self.features.iter().zip(self.model.iter()).skip(1) { @@ -254,7 +255,7 @@ impl AdaBoost { /// # Errors: Returns an error if the file cannot be opened or read. /// # Notes: The model is loaded into the `features` and `model` vectors, /// and the bias is calculated as the negative sum of the weights divided by 2. 
- pub fn load_model(&mut self, filename: &str) -> std::io::Result<()> { + pub fn load_model(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; let reader = BufReader::new(file); let mut m: HashMap = HashMap::new(); @@ -313,12 +314,10 @@ impl AdaBoost { } else { pn += 1 } + } else if label > 0 { + np += 1 } else { - if label > 0 { - np += 1 - } else { - nn += 1 - } + nn += 1 } } diff --git a/src/extractor.rs b/src/extractor.rs new file mode 100644 index 0000000..6af0529 --- /dev/null +++ b/src/extractor.rs @@ -0,0 +1,67 @@ +use std::collections::HashSet; +use std::error::Error; +use std::fs::File; +use std::io::{self, BufRead, Write}; +use std::path::Path; + +use crate::segmenter::Segmenter; + +pub struct Extractor { + segmenter: Segmenter, +} + +impl Default for Extractor { + fn default() -> Self { + Self::new() + } +} + +impl Extractor { + pub fn new() -> Self { + Extractor { + segmenter: Segmenter::new(None), + } + } + + pub fn extract( + &mut self, + corpus_path: &Path, + features_path: &Path, + ) -> Result<(), Box> { + // Read sentences from stdin + // Each line is treated as a separate sentence + let corpus_file = File::open(corpus_path)?; + let corpus = io::BufReader::new(corpus_file); + + // Create a file to write the features + let features_file = File::create(features_path)?; + let mut features = io::BufWriter::new(features_file); + + // learner function to write features + // This function will be called for each word in the input sentences + // It takes a set of attributes and a label, and writes them to stdout + let mut learner = |attributes: HashSet, label: i8| { + let mut attrs: Vec = attributes.into_iter().collect(); + attrs.sort(); + let mut line = vec![label.to_string()]; + line.extend(attrs); + writeln!(features, "{}", line.join("\t")).expect("Failed to write features"); + }; + + for line in corpus.lines() { + match line { + Ok(line) => { + let line = line.trim(); + if !line.is_empty() { + 
self.segmenter.add_sentence_with_writer(line, &mut learner); + } + } + Err(err) => { + eprintln!("Error reading input: {}", err); + } + } + } + + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index feca1db..3d62349 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,7 @@ pub mod adaboost; +pub mod extractor; pub mod segmenter; +pub mod trainer; const VERERSION: &str = env!("CARGO_PKG_VERSION"); diff --git a/src/main.rs b/src/main.rs index 6bccaae..fcc5934 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,15 +1,16 @@ -use std::collections::HashSet; use std::error::Error; -use std::fs::File; use std::io::{self, BufRead, Write}; +use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use clap::{Args, Parser, Subcommand}; use litsea::adaboost::AdaBoost; +use litsea::extractor::Extractor; use litsea::get_version; use litsea::segmenter::Segmenter; +use litsea::trainer::Trainer; #[derive(Debug, Args)] #[clap( @@ -18,8 +19,8 @@ use litsea::segmenter::Segmenter; version = get_version(), )] struct ExtractArgs { - corpus_file: String, - features_file: String, + corpus_file: PathBuf, + features_file: PathBuf, } #[derive(Debug, Args)] @@ -38,10 +39,10 @@ struct TrainArgs { num_threads: usize, #[arg(short = 'm', long)] - load_model: Option, + load_model_file: Option, - instances_file: String, - model_file: String, + features_file: PathBuf, + model_file: PathBuf, } #[derive(Debug, Args)] @@ -50,7 +51,7 @@ struct TrainArgs { version = get_version(), )] struct SegmentArgs { - model_file: String, + model_file: PathBuf, } #[derive(Debug, Subcommand)] @@ -73,44 +74,11 @@ struct CommandArgs { } fn extract(args: ExtractArgs) -> Result<(), Box> { - // Create a file to write the features - let features_file = File::create(&args.features_file)?; - let mut features = io::BufWriter::new(features_file); - - // Initialize the segmenter - // No model is loaded, so it will use the default feature extraction - let mut segmenter = Segmenter::new(None); - - 
// learner function to write features - // This function will be called for each word in the input sentences - // It takes a set of attributes and a label, and writes them to stdout - let mut learner = |attributes: HashSet, label: i8| { - let mut attrs: Vec = attributes.into_iter().collect(); - attrs.sort(); - let mut line = vec![label.to_string()]; - line.extend(attrs); - writeln!(features, "{}", line.join("\t")).expect("Failed to write features"); - }; - - // Read sentences from stdin - // Each line is treated as a separate sentence - let corpus_file = File::open(&args.corpus_file)?; - let corpus = io::BufReader::new(corpus_file); - - for line in corpus.lines() { - match line { - Ok(line) => { - let line = line.trim(); - if !line.is_empty() { - segmenter.add_sentence_with_writer(line, &mut learner); - } - } - Err(err) => { - eprintln!("Error reading input: {}", err); - } - } - } + let mut extractor = Extractor::new(); + + extractor.extract(args.corpus_file.as_path(), args.features_file.as_path())?; + println!("Feature extraction completed successfully."); Ok(()) } @@ -127,56 +95,46 @@ fn train(args: TrainArgs) -> Result<(), Box> { }) .expect("Error setting Ctrl-C handler"); - let mut boost = AdaBoost::new(args.threshold, args.num_iterations, args.num_threads); + let mut trainer = Trainer::new( + args.threshold, + args.num_iterations, + args.num_threads, + args.features_file.as_path(), + ); - if let Some(model_path) = args.load_model.as_ref() { - boost.load_model(model_path).unwrap(); + if let Some(model_path) = &args.load_model_file { + trainer.load_model(model_path.as_path())?; } - boost.initialize_features(&args.instances_file).unwrap(); - boost.initialize_instances(&args.instances_file).unwrap(); - - boost.train(running.clone()); - boost.save_model(&args.model_file).unwrap(); - boost.show_result(); + trainer.train(running, args.model_file.as_path())?; + println!("Training completed successfully."); Ok(()) } fn segment(args: SegmentArgs) -> Result<(), Box> { - 
let model_path = &args.model_file; + let mut leaner = AdaBoost::new(0.01, 100, 1); + leaner.load_model(args.model_file.as_path())?; - let mut model = AdaBoost::new(0.01, 100, 1); - if let Err(e) = model.load_model(model_path) { - eprintln!("Failed to load model: {}", e); - std::process::exit(1); - } - - let segmenter = Segmenter::new(Some(model)); + let segmenter = Segmenter::new(Some(leaner)); let stdin = io::stdin(); let stdout = io::stdout(); let mut writer = io::BufWriter::new(stdout.lock()); for line in stdin.lock().lines() { - match line { - Ok(line) => { - let line = line.trim(); - if line.is_empty() { - continue; - } - let tokens = segmenter.parse(line); - writeln!(writer, "{}", tokens.join(" ")).expect("write failed"); - } - Err(err) => { - eprintln!("Error reading input: {}", err); - } + let line = line?; + let line = line.trim(); + if line.is_empty() { + continue; } + let tokens = segmenter.parse(line); + writeln!(writer, "{}", tokens.join(" "))?; } Ok(()) } -fn main() -> Result<(), Box> { +fn run() -> Result<(), Box> { let args = CommandArgs::parse(); match args.command { @@ -185,3 +143,10 @@ fn main() -> Result<(), Box> { Commands::Segment(args) => segment(args), } } + +fn main() { + if let Err(e) = run() { + eprintln!("Error: {}", e); + std::process::exit(1); + } +} diff --git a/src/segmenter.rs b/src/segmenter.rs index 18f2d91..faabbc6 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -10,8 +10,10 @@ pub struct Segmenter { impl Segmenter { /// Creates a new Segmenter with the given AdaBoost learner or a default one + /// /// # Arguments /// * `learner` - An optional AdaBoost instance. If None, a default AdaBoost instance is created. + /// /// # Returns /// A new Segmenter instance with the specified or default AdaBoost learner. pub fn new(learner: Option) -> Self { @@ -32,9 +34,11 @@ impl Segmenter { } } - /// gets the type of a character based on predefined patterns + /// Gets the type of a character based on predefined patterns. 
+ /// /// # Arguments /// * `ch` - A string slice representing a single character. + /// /// # Returns /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others). pub fn get_type(&self, ch: &str) -> &'static str { @@ -46,10 +50,11 @@ impl Segmenter { "O" } - /// Adds a sentence to the segmenter with a custom writer function + /// Adds a sentence to the segmenter with a custom writer function. + /// /// # Arguments /// * `sentence` - A string slice representing the sentence to be added. - /// * `writer` - A closure that takes a HashSet of attributes and a label (i8) as arguments. + /// * `writer` - A closure that takes a `HashSet` of attributes and a label (`i8`) as arguments. /// This closure is called for each word in the sentence, allowing custom handling of the attributes and label. pub fn add_sentence_with_writer(&mut self, sentence: &str, mut writer: F) where @@ -91,12 +96,14 @@ impl Segmenter { } } - /// Adds a sentence to the segmenter for training + /// Adds a sentence to the segmenter for training. + /// /// # Arguments /// * `sentence` - A string slice representing the sentence to be added. + /// /// This method processes the sentence, extracts features, and adds them to the AdaBoost learner. /// It constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances. - /// If the sentence is empty or too short, it does nothing. + /// If the sentence is empty or too short, it does nothing. pub fn add_sentence(&mut self, sentence: &str) { if sentence.is_empty() { return; @@ -130,14 +137,16 @@ impl Segmenter { for i in 4..(chars.len() - 3) { let label = if tags[i] == "B" { 1 } else { -1 }; let attrs = self.get_attributes(i, &tags, &chars, &types); - // ★ ここで毎回 self.learner を呼ぶことで借用がぶつからない! + // Call the learner for each instance; doing so individually avoids borrowing conflicts. 
self.learner.add_instance(attrs, label); } } - /// Parses a sentence and segments it into words + /// Parses a sentence and segments it into words. + /// /// # Arguments /// * `sentence` - A string slice representing the sentence to be parsed. + /// /// # Returns /// A vector of strings, where each string is a segmented word from the sentence. pub fn parse(&self, sentence: &str) -> Vec { @@ -174,12 +183,14 @@ impl Segmenter { result } - /// Gets the attributes for a specific index in the character and type arrays + /// Gets the attributes for a specific index in the character and type arrays. + /// /// # Arguments /// * `i` - The index for which to get the attributes. /// * `tags` - A slice of strings representing the tags for each character. /// * `chars` - A slice of strings representing the characters in the sentence. /// * `types` - A slice of strings representing the types of each character. + /// /// # Returns /// A HashSet of strings representing the attributes for the specified index. fn get_attributes( diff --git a/src/trainer.rs b/src/trainer.rs new file mode 100644 index 0000000..101218d --- /dev/null +++ b/src/trainer.rs @@ -0,0 +1,48 @@ +use std::path::Path; +use std::sync::atomic::AtomicBool; +use std::sync::Arc; + +use crate::adaboost::AdaBoost; + +pub struct Trainer { + learner: AdaBoost, +} + +impl Trainer { + pub fn new( + threshold: f64, + num_iterations: usize, + num_threads: usize, + features_path: &Path, + ) -> Self { + let mut learner = AdaBoost::new(threshold, num_iterations, num_threads); + + learner + .initialize_features(features_path) + .expect("Failed to initialize features"); + learner + .initialize_instances(features_path) + .expect("Failed to initialize instances"); + + Trainer { learner } + } + + pub fn load_model(&mut self, model_path: &Path) -> Result<(), Box> { + // Load the model from the specified file + Ok(self.learner.load_model(model_path)?) 
+ } + + pub fn train( + &mut self, + running: Arc, + model_path: &Path, + ) -> Result<(), Box> { + self.learner.train(running.clone()); + + // Save the trained model to the specified file + self.learner.save_model(model_path)?; + self.learner.show_result(); + + Ok(()) + } +} From e81be7e13162d7357ead38d06793fcb9b50e8e78 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Tue, 3 Jun 2025 10:02:54 +0900 Subject: [PATCH 02/15] Add docs --- README.md | 7 +++++++ src/adaboost.rs | 37 +++++++++++++++++++++++-------------- src/extractor.rs | 19 +++++++++++++++++++ src/main.rs | 33 +++++++++++++++++++++++++++++++++ src/segmenter.rs | 10 +++++++--- src/trainer.rs | 38 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 09bd431..727f0ed 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,12 @@ Extract the information and features from the corpus: ./target/release/litsea extract ./resources/corpus.txt ./resources/features.txt ``` +The output from the `extract` command is similar to: + +```text +Feature extraction completed successfully. +``` + Train the features output by the above command using AdaBoost. Training stops if the new weak classifier’s accuracy falls below 0.001 or after 10,000 iterations. ```sh @@ -81,6 +87,7 @@ Accuracy: 100.00% (61 / 61) Precision: 100.00% (24 / 24) Recall: 100.00% (24 / 24) Confusion Matrix: TP: 24, FP: 0, FN: 0, TN: 37 +Training completed successfully. ``` ## How to segment sentences into words diff --git a/src/adaboost.rs b/src/adaboost.rs index 51b4fa3..cfa1188 100644 --- a/src/adaboost.rs +++ b/src/adaboost.rs @@ -11,7 +11,6 @@ type Label = i8; /// This implementation uses a simple feature extraction method /// and is designed for educational purposes. /// It is not optimized for performance or large datasets. 
-/// #[derive(Debug)] pub struct AdaBoost { pub threshold: f64, @@ -27,11 +26,15 @@ pub struct AdaBoost { } impl AdaBoost { - /// Creates a new [`AdaBoost`]. + /// Creates a new instance of [`AdaBoost`]. + /// This method initializes the AdaBoost parameters such as threshold, + /// number of iterations, and number of threads. + /// /// # Arguments /// * `threshold`: The threshold for stopping the training. /// * `num_iterations`: The maximum number of iterations for training. /// * `num_threads`: The number of threads to use for training (not used in this implementation). + /// /// # Returns: A new instance of [`AdaBoost`]. pub fn new(threshold: f64, num_iterations: usize, num_threads: usize) -> Self { AdaBoost { @@ -50,9 +53,12 @@ impl AdaBoost { /// Initializes the features from a file. /// The file should contain lines with a label followed by space-separated features. + /// /// # Arguments /// * `filename`: The path to the file containing the features. + /// /// # Returns: A result indicating success or failure. + /// /// # Errors: Returns an error if the file cannot be opened or read. pub fn initialize_features(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; @@ -97,9 +103,12 @@ impl AdaBoost { /// Initializes the instances from a file. /// The file should contain lines with a label followed by space-separated features. + /// /// # Arguments /// * `filename`: The path to the file containing the instances. + /// /// # Returns: A result indicating success or failure. + /// /// # Errors: Returns an error if the file cannot be opened or read. pub fn initialize_instances(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; @@ -141,9 +150,9 @@ impl AdaBoost { /// Trains the AdaBoost model. /// This method iteratively updates the model based on the training data. + /// /// # Arguments /// * `running`: An `Arc` to control the running state of the training process. 
- /// # Notes: The training process will stop if `running` is set to false. pub fn train(&mut self, running: Arc) { let num_features = self.features.len(); @@ -227,12 +236,13 @@ impl AdaBoost { /// Saves the trained model to a file. /// The model is saved in a format where each line contains a feature and its weight, /// with the last line containing the bias term. + /// /// # Arguments /// * `filename`: The path to the file where the model will be saved. + /// /// # Returns: A result indicating success or failure. + /// /// # Errors: Returns an error if the file cannot be created or written to. - /// # Notes: The bias term is calculated as the negative sum of the weights divided by 2. - /// The model is saved in a way that can be easily loaded later. pub fn save_model(&self, filename: &Path) -> std::io::Result<()> { let mut file = File::create(filename)?; let mut bias = -self.model[0]; @@ -249,12 +259,13 @@ impl AdaBoost { /// Loads a model from a file. /// The file should contain lines with a feature and its weight, /// with the last line containing the bias term. + /// /// # Arguments /// * `filename`: The path to the file containing the model. + /// /// # Returns: A result indicating success or failure. + /// /// # Errors: Returns an error if the file cannot be opened or read. - /// # Notes: The model is loaded into the `features` and `model` vectors, - /// and the bias is calculated as the negative sum of the weights divided by 2. pub fn load_model(&mut self, filename: &Path) -> std::io::Result<()> { let file = File::open(filename)?; let reader = BufReader::new(file); @@ -283,16 +294,14 @@ impl AdaBoost { /// Gets the bias term of the model. /// The bias is calculated as the negative sum of the model weights divided by 2. - /// # Returns: The bias term as a `f64`. - /// # Notes: This is used to adjust the decision boundary of the model. + /// + /// # Returns:The bias term as a `f64`. 
pub fn get_bias(&self) -> f64 { -self.model.iter().sum::() / 2.0 } /// Displays the result of the model's performance on the training data. /// It calculates accuracy, precision, recall, and confusion matrix. - /// # Notes: This method iterates through the instances, calculates the score for each, - /// and counts true positives, false positives, true negatives, and false negatives. pub fn show_result(&self) { let bias = self.get_bias(); let mut pp = 0; @@ -342,11 +351,10 @@ impl AdaBoost { /// Adds a new instance to the model. /// The instance is represented by a set of attributes and a label. + /// /// # Arguments /// * `attributes`: A `HashSet` containing the attributes of the instance. /// * `label`: The label of the instance, represented as an `i8`. - /// # Notes: The attributes are sorted and added to the `features` vector if they do not already exist. - /// The instance is stored in `instances_buf`, and its start and end indices are recorded in `instances`. pub fn add_instance(&mut self, attributes: HashSet, label: i8) { let start = self.instances_buf.len(); let mut attrs: Vec = attributes.into_iter().collect(); @@ -369,10 +377,11 @@ impl AdaBoost { } /// Predicts the label for a given set of attributes. + /// /// # Arguments /// * `attributes`: A `HashSet` containing the attributes to predict. + /// /// # Returns: The predicted label as an `i8`, where 1 indicates a positive prediction and -1 indicates a negative prediction. - /// # Notes: The prediction is made by calculating the score based on the model weights for the given attributes. pub fn predict(&self, attributes: HashSet) -> i8 { let mut score = 0.0; for attr in attributes { diff --git a/src/extractor.rs b/src/extractor.rs index 6af0529..6efa325 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -6,23 +6,42 @@ use std::path::Path; use crate::segmenter::Segmenter; +/// Extractor struct for processing text data and extracting features. 
+/// It reads sentences from a corpus file, segments them into words, +/// and writes the extracted features to a specified output file. pub struct Extractor { segmenter: Segmenter, } impl Default for Extractor { + /// Creates a new instance of [`Extractor`] with default settings. + /// + /// # Returns + /// Returns a new instance of `Extractor`. fn default() -> Self { Self::new() } } impl Extractor { + /// Creates a new instance of [`Extractor`]. + /// + /// # Returns + /// Returns a new instance of `Extractor` with a new `Segmenter`. pub fn new() -> Self { Extractor { segmenter: Segmenter::new(None), } } + /// Extracts features from a corpus file and writes them to a specified output file. + /// + /// # Arguments + /// * `corpus_path` - The path to the input corpus file containing sentences. + /// * `features_path` - The path to the output file where extracted features will be written. + /// + /// # Returns + /// Returns a Result indicating success or failure. pub fn extract( &mut self, corpus_path: &Path, diff --git a/src/main.rs b/src/main.rs index fcc5934..e31f697 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,7 @@ use litsea::get_version; use litsea::segmenter::Segmenter; use litsea::trainer::Trainer; +/// Arguments for the extract command. #[derive(Debug, Args)] #[clap( author, @@ -23,6 +24,7 @@ struct ExtractArgs { features_file: PathBuf, } +/// Arguments for the train command. #[derive(Debug, Args)] #[clap(author, about = "Train a segmenter", @@ -45,6 +47,7 @@ struct TrainArgs { model_file: PathBuf, } +/// Arguments for the segment command. #[derive(Debug, Args)] #[clap(author, about = "Segment a sentence", @@ -54,6 +57,7 @@ struct SegmentArgs { model_file: PathBuf, } +/// Subcommands for lietsea CLI. #[derive(Debug, Subcommand)] enum Commands { Extract(ExtractArgs), @@ -61,6 +65,7 @@ enum Commands { Segment(SegmentArgs), } +/// Arguments for the litsea command. 
#[derive(Debug, Parser)] #[clap( name = "litsea", @@ -73,6 +78,15 @@ struct CommandArgs { command: Commands, } +/// Extract features from a corpus file and write them to a specified output file. +/// This function reads sentences from the corpus file, segments them into words, +/// and writes the extracted features to the output file. +/// +/// # Arguments +/// * `args` - The arguments for the extract command [`ExtractArgs`]. +/// +/// # Returns +/// Returns a Result indicating success or failure. fn extract(args: ExtractArgs) -> Result<(), Box> { let mut extractor = Extractor::new(); @@ -82,6 +96,15 @@ fn extract(args: ExtractArgs) -> Result<(), Box> { Ok(()) } +/// Train a segmenter using the provided arguments. +/// This function initializes a Trainer with the specified parameters, +/// loads a model if specified, and trains the model using the features file. +/// +/// # Arguments +/// * `args` - The arguments for the train command [`TrainArgs`]. +/// +/// # Returns +/// Returns a Result indicating success or failure. fn train(args: TrainArgs) -> Result<(), Box> { let running = Arc::new(AtomicBool::new(true)); let r = running.clone(); @@ -112,6 +135,16 @@ fn train(args: TrainArgs) -> Result<(), Box> { Ok(()) } +/// Segment a sentence using the trained model. +/// This function loads the AdaBoost model from the specified file, +/// reads sentences from standard input, segments them into words, +/// and writes the segmented sentences to standard output. +/// +/// # Arguments +/// * `args` - The arguments for the segment command [`SegmentArgs`]. +/// +/// # Returns +/// Returns a Result indicating success or failure. 
fn segment(args: SegmentArgs) -> Result<(), Box> { let mut leaner = AdaBoost::new(0.01, 100, 1); leaner.load_model(args.model_file.as_path())?; diff --git a/src/segmenter.rs b/src/segmenter.rs index faabbc6..2a58084 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -3,13 +3,14 @@ use regex::Regex; use std::collections::HashSet; /// Segmenter struct for text segmentation using AdaBoost +/// It uses predefined patterns to classify characters and segments sentences into words. pub struct Segmenter { patterns: Vec<(Regex, &'static str)>, pub learner: AdaBoost, } impl Segmenter { - /// Creates a new Segmenter with the given AdaBoost learner or a default one + /// creates a new instance of [`Segmenter`]. /// /// # Arguments /// * `learner` - An optional AdaBoost instance. If None, a default AdaBoost instance is created. @@ -55,7 +56,10 @@ impl Segmenter { /// # Arguments /// * `sentence` - A string slice representing the sentence to be added. /// * `writer` - A closure that takes a `HashSet` of attributes and a label (`i8`) as arguments. - /// This closure is called for each word in the sentence, allowing custom handling of the attributes and label. + /// + /// This closure is called for each instance created from the sentence. + /// This method processes the sentence, extracts features, and calls the writer function for each instance. + /// It constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances. pub fn add_sentence_with_writer(&mut self, sentence: &str, mut writer: F) where F: FnMut(HashSet, i8), @@ -103,7 +107,7 @@ impl Segmenter { /// /// This method processes the sentence, extracts features, and adds them to the AdaBoost learner. /// It constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances. - /// If the sentence is empty or too short, it does nothing. + /// If the sentence is empty or too short, it does nothing. 
pub fn add_sentence(&mut self, sentence: &str) { if sentence.is_empty() { return; diff --git a/src/trainer.rs b/src/trainer.rs index 101218d..dd76a94 100644 --- a/src/trainer.rs +++ b/src/trainer.rs @@ -4,11 +4,28 @@ use std::sync::Arc; use crate::adaboost::AdaBoost; +/// Trainer struct for managing the AdaBoost training process. +/// It initializes the AdaBoost learner with the specified parameters, +/// loads the model from a file, and provides methods to train the model +/// and save the trained model. pub struct Trainer { learner: AdaBoost, } impl Trainer { + /// Creates a new instance of [`Trainer`]. + /// + /// # Arguments + /// * `threshold` - The threshold for the AdaBoost algorithm. + /// * `num_iterations` - The number of iterations for the training. + /// * `num_threads` - The number of threads to use for training. + /// * `features_path` - The path to the features file. + /// + /// # Returns + /// Returns a new instance of `Trainer`. + /// + /// # Errors + /// Returns an error if the features or instances cannot be initialized. pub fn new( threshold: f64, num_iterations: usize, @@ -27,11 +44,32 @@ impl Trainer { Trainer { learner } } + /// Load Model from a file + /// + /// # Arguments + /// * `model_path` - The path to the model file to load. + /// + /// # Returns + /// Returns a Result indicating success or failure. + /// + /// # Errors + /// Returns an error if the model cannot be loaded. pub fn load_model(&mut self, model_path: &Path) -> Result<(), Box> { // Load the model from the specified file Ok(self.learner.load_model(model_path)?) } + /// Train the AdaBoost model. + /// + /// # Arguments + /// * `running` - An Arc to control the running state of the training process. + /// * `model_path` - The path to save the trained model. + /// + /// # Returns + /// Returns a Result indicating success or failure. + /// + /// # Errors + /// Returns an error if the training fails or if the model cannot be saved. 
pub fn train( &mut self, running: Arc, From e48c93e37db89140ea61b6d50aff826974b11590 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Tue, 3 Jun 2025 11:47:30 +0900 Subject: [PATCH 03/15] Add tests (#3) * Add tests * Update doc * Rename --- src/segmenter.rs | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/src/segmenter.rs b/src/segmenter.rs index 2a58084..ec613a8 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -29,6 +29,7 @@ impl Segmenter { (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), (Regex::new(r"[0-90-9]").unwrap(), "N"), ]; + Segmenter { patterns, learner: learner.unwrap_or_else(|| AdaBoost::new(0.01, 100, 1)), @@ -42,13 +43,13 @@ impl Segmenter { /// /// # Returns /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others). - pub fn get_type(&self, ch: &str) -> &'static str { - for (pattern, s_type) in &self.patterns { + pub fn get_type(&self, ch: &str) -> &str { + for (pattern, label) in &self.patterns { if pattern.is_match(ch) { - return s_type; + return label; } } - "O" + "O" // Other } /// Adds a sentence to the segmenter with a custom writer function. 
@@ -269,3 +270,36 @@ impl Segmenter { .collect() } } + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + + #[test] + fn test_segmenter() { + let model_file = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("./resources") + .join("RWCP.model"); + + let mut learner = AdaBoost::new(0.01, 100, 1); + learner.load_model(model_file.as_path()).unwrap(); + + let mut segmenter = Segmenter::new(Some(learner)); + let sentence = "これはテストです。"; + segmenter.add_sentence(sentence); + let result = segmenter.parse(sentence); + assert!(!result.is_empty()); + assert_eq!(result.len(), 5); // Adjust based on expected segmentation + } + + #[test] + fn test_get_type() { + let segmenter = Segmenter::new(None); + assert_eq!(segmenter.get_type("あ"), "I"); // Hiragana + assert_eq!(segmenter.get_type("漢"), "H"); // Kanji + assert_eq!(segmenter.get_type("A"), "A"); // Latin + assert_eq!(segmenter.get_type("1"), "N"); // Digit + } +} From 6ad062567c76d1e11fe7662741b20633f69b33b2 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Tue, 3 Jun 2025 19:25:02 +0900 Subject: [PATCH 04/15] Add rustfmt.toml (#4) * Add rustfmt.toml * Fix regex patterns --- resources/RWCP.model.bak | 1341 ++++++++++++++++++++++++++++++++++++++ rustfmt.toml | 80 +++ src/adaboost.rs | 37 +- src/segmenter.rs | 36 +- 4 files changed, 1457 insertions(+), 37 deletions(-) create mode 100644 resources/RWCP.model.bak create mode 100644 rustfmt.toml diff --git a/resources/RWCP.model.bak b/resources/RWCP.model.bak new file mode 100644 index 0000000..eef6ffa --- /dev/null +++ b/resources/RWCP.model.bak @@ -0,0 +1,1341 @@ +-0.0332 +UW6:ン -0.0496 +UW6:連 0.0463 +UW6:ル -0.0673 +UW6:か 0.0241 +UW6:業 -0.0697 +UW6:, 0.0227 +UW6:者 0.1811 +UW6:. 
0.0808 +UW6:福 0.0974 +UW6:後 0.0535 +UW6:広 -0.0695 +UW6:ル -0.0673 +UW6:E1 0.0306 +UW6:ン -0.0496 +UW6:員 -0.1212 +UW6:に -0.0149 +UW6:学 -0.096 +UW6:郎 0.1082 +UW6:E1 0.0306 +UW6:相 0.0753 +UW6:も -0.0206 +UW6:り 0.0187 +UW6:る -0.0135 +UW6:社 -0.0507 +UW6:を 0.0195 +UW6:な -0.0253 +UW6:中 0.0201 +UW6:と -0.0105 +UW6:は -0.0236 +UW6:の -0.0417 +UW6:っ 0.0573 +UW6:で 0.0101 +UW6:て -0.1014 +UW6:す 0.0383 +UW6:じ 0.1782 +UW6:た -0.0428 +UW6:こ -0.02 +UW6:会 0.0624 +UW6:空 -0.0822 +UW6:件 -0.08 +UW6:く -0.0121 +UW6:が -0.0073 +UW6:あ -0.0307 +UW6:前 0.0302 +UW6:う 0.0189 +UW6:一 -0.0277 +UW6:1 -0.027 +UW6:市 0.0887 +UW6:委 0.0798 +UW6:区 0.1792 +UW6:1 -0.027 +UW6:、 0.0227 +UW6:。 0.0808 +BW1:には 0.1498 +BW1:B1あ 0.1404 +BW1:ため 0.0601 +BW1:大阪 0.1497 +BW1:に対 -0.0912 +BW1:引き -0.1336 +BW1:から 0.3472 +BW1:れた 0.2369 +BW1:うん 0.0665 +BW1:,同 0.0727 +BW1:毎日 -0.2113 +BW1:やむ -0.1947 +BW1:です 0.3445 +BW1:まで 0.1711 +BW1:いる 0.0672 +BW1:、同 0.0727 +BW1:」と 0.1682 +BW1:、と 0.066 +BW1:よっ -0.2565 +BW1:なっ 0.3015 +BW1:日本 -0.0195 +BW1:すで -0.3399 +BW1:平方 -0.2314 +BW1:れで -0.0913 +BW1:とい -0.4915 +BW1:ませ 0.2448 +BW1:つい -0.0802 +BW1:を見 0.0731 +BW1:てき 0.1249 +BW1:それ -0.0871 +BW1:こん -0.1262 +BW1:でき 0.1127 +BW1:をし 0.186 +BW1:てい 0.0805 +BW1:大き -0.2604 +BW1:B1あ 0.1404 +BW1:の中 0.0741 +BW1:して 0.1104 +BW1:では 0.0844 +BW1:さら -0.4143 +BW1:どこ 0.3887 +BW1:京都 0.2558 +BW1:いう 0.1743 +BW1:うし -0.4817 +BW1:ない 0.5713 +BW1:にし 0.2468 +BW1:まま 0.26 +BW1:あっ 0.1505 +BW1:の一 -0.0501 +BW1:亡く -0.1886 +BW1:こと 0.2083 +BW1:った 0.3463 +BW1:さん 0.4573 +BW1:にも 0.1671 +BW1:なん -0.1113 +BW1:B1同 0.0542 +BW1:とみ 0.1922 +BW1:」と 0.1682 +BW1:いっ -0.2055 +BW1:たち 0.1122 +BW1:本当 -0.2423 +BW1:,と 0.066 +BW1:がら 0.06 +BW1:こう -0.079 +BW1:取り -0.2784 +BW1:目指 -0.0724 +BW1:した 0.2641 +BW1:B1同 0.0542 +BW1:そこ 0.1977 +BW1:まる -0.2155 +BW1:など 0.7379 +BW2:との 0.072 +BW2:われ 0.7901 +BW2:とと -0.2279 +BW2:新聞 -0.4066 +BW2:を通 -1.1877 +BW2:とみ 0.5168 +BW2:朝鮮 -0.2355 +BW2:大阪 -0.2471 +BW2:同党 0.097 +BW2:とい 0.189 +BW2:がい 0.0853 +BW2:とこ -0.1746 +BW2:11 -0.0669 +BW2:米国 -0.4268 +BW2:れて 0.0849 +BW2:日新 -0.0722 +BW2:れば 
0.4114 +BW2:うか 0.249 +BW2:くな -0.1597 +BW2:に関 -1.1388 +BW2:こと -0.8392 +BW2:かし -0.135 +BW2:この -0.4193 +BW2:なん 0.3099 +BW2:府県 -0.2363 +BW2:にし 0.2748 +BW2:会社 -0.1116 +BW2:同日 -0.0913 +BW2:れた 0.427 +BW2:かも -0.0602 +BW2:にな 0.2454 +BW2:然と -0.1384 +BW2:に対 -1.4943 +BW2:から -0.7194 +BW2:上が -0.4479 +BW2:かれ 0.4612 +BW2:んな -0.4115 +BW2:ては -0.311 +BW2:めて -0.3153 +BW2:んだ 0.0728 +BW2:らか -0.0944 +BW2:一方 -0.1375 +BW2:にお -0.1615 +BW2:分の -0.7758 +BW2:まし -0.1316 +BW2:てき 0.364 +BW2:てく 0.2551 +BW2:てい 0.6144 +BW2:らに -0.1897 +BW2:手権 -0.1982 +BW2:一日 0.097 +BW2:りし 0.0651 +BW2:はず -0.2532 +BW2:いう -0.1609 +BW2:少な -0.105 +BW2:はが -0.1033 +BW2:はい 0.1073 +BW2:ろう 0.6067 +BW2:でも -0.4203 +BW2:りま 0.162 +BW2:日本 -0.7068 +BW2:社会 -0.1276 +BW2:らし -0.1611 +BW2:もの -1.0713 +BW2:させ 0.4533 +BW2:.. -1.1822 +BW2:に従 -0.4688 +BW2:東京 -0.1543 +BW2:もい 0.223 +BW2:され 1.3168 +BW2:その -0.3744 +BW2:たい -0.1253 +BW2:たた -0.0662 +BW2:一部 -0.1051 +BW2:ばれ 0.1813 +BW2:委員 -0.125 +BW2:った 0.4589 +BW2:さん -0.3977 +BW2:たは -0.0939 +BW2:一人 0.0602 +BW2:って 0.1647 +BW2:たと 0.1224 +BW2:っと -0.2094 +BW2:たち -0.0786 +BW2:ただ -0.3857 +BW2:立て -0.099 +BW2:まれ 0.5409 +BW2:出て 0.2163 +BW2:ても -0.3065 +BW2:まで -0.6621 +BW2:11 -0.0669 +BW2:年度 -0.8669 +BW2:なの 0.2614 +BW2:など -0.6509 +BW2:でし -0.3828 +BW2:第に -0.1612 +BW2:曜日 -0.0601 +BW2:です -0.4761 +BW2:なが -0.1313 +BW2:でい 0.2666 +BW2:ない -0.2488 +BW2:でき -0.1528 +BW2:して 0.0972 +BW2:きた 0.1941 +BW2:ので -0.7059 +BW2:のに -0.6041 +BW2:しな 0.0939 +BW2:のの -0.6125 +BW2:本人 -0.2697 +BW2:―― -0.573 +BW2:しい -0.1819 +BW2:によ -0.7236 +BW2:のか 0.2093 +BW2:しか -0.0545 +BW2:年間 -0.1626 +BW2:がら -0.3198 +BW2:とも -0.3941 +BW2:−− -1.3175 +BW2:した 0.5078 +BW2:日米 0.3372 +BW3:との 0.0541 +BW3:われ -0.0605 +BW3:だ。 0.4098 +BW3:す. 
-0.131 +BW3:い。 -0.1185 +BW3:か。 0.2857 +BW3:がっ -0.0913 +BW3:がけ -0.1127 +BW3:とう -0.1387 +BW3:ず、 0.3426 +BW3:新聞 -0.5055 +BW3:日, 0.0974 +BW3:とし 0.2266 +BW3:会議 0.086 +BW3:がき -0.4855 +BW3:れて 0.1375 +BW3:する 0.6521 +BW3:ず, 0.3426 +BW3:に, -0.1021 +BW3:られ 0.682 +BW3:かけ -0.0743 +BW3:こと 0.7397 +BW3:この 0.1542 +BW3:かに -0.0669 +BW3:かっ -0.4098 +BW3:てい 0.624 +BW3:が、 0.1816 +BW3:うち 0.1117 +BW3:れた 0.185 +BW3:にな 0.1906 +BW3:には 0.2644 +BW3:かり -0.267 +BW3:から 0.652 +BW3:は、 0.1337 +BW3:まっ -0.1549 +BW3:まで 0.6154 +BW3:んで 0.0798 +BW3:んだ 0.0606 +BW3:うと 0.4798 +BW3:ころ -0.2757 +BW3:ます 0.6943 +BW3:てお 0.0855 +BW3:入り 0.1232 +BW3:にし 0.1771 +BW3:いえ 0.2079 +BW3:す。 -0.131 +BW3:いく 0.3029 +BW3:ずに 0.0841 +BW3:いい 0.5308 +BW3:るる 0.3818 +BW3:れ、 0.0854 +BW3:いた 0.2056 +BW3:の子 -0.1 +BW3:だっ 0.1004 +BW3:いっ 0.1883 +BW3:カ月 0.099 +BW3:か. 0.2857 +BW3:けど 0.1374 +BW3:た。 0.8875 +BW3:社会 0.2024 +BW3:さい -0.0714 +BW3:らし 0.1479 +BW3:い. -0.1185 +BW3:始め 0.1681 +BW3:の、 -0.0724 +BW3:が, 0.1816 +BW3:たい -0.0594 +BW3:った -0.4748 +BW3:さを 0.0976 +BW3:たの 0.0812 +BW3:日、 0.0974 +BW3:って 0.03 +BW3:べき 0.2181 +BW3:の, -0.0724 +BW3:に、 -0.1021 +BW3:そう 0.0428 +BW3:カ月 0.099 +BW3:まれ -0.0793 +BW3:ても 0.0302 +BW3:大会 0.2217 +BW3:たり -0.1183 +BW3:たる -0.0853 +BW3:では 0.2295 +BW3:など 0.2135 +BW3:いる 0.56 +BW3:し、 0.1557 +BW3:いわ 0.1527 +BW3:た. 0.8875 +BW3:ある 0.3846 +BW3:あり 0.0719 +BW3:れる 0.1091 +BW3:でに -0.1482 +BW3:は, 0.1337 +BW3:です 0.1437 +BW3:なく -0.0903 +BW3:ない 0.1796 +BW3:して 0.1449 +BW3:市 0.0965 +BW3:きた 0.1645 +BW3:しな 0.2608 +BW3:れ, 0.0854 +BW3:どう 0.4664 +BW3:しま 0.12 +BW3:まし 0.1113 +BW3:だ. 0.4098 +BW3:しい -0.3714 +BW3:し, 0.1557 +BW3:えと 0.1454 +BW3:れば -0.3246 +BW3:あた -0.2194 +BW3:がり -0.2064 +BW3:がら -0.4977 +BW3:とも -0.3543 +BW3:した 0.3562 +UC3:A -0.137 +UC3:I 0.2311 +TW4:からな -0.2348 +TW4:ません 0.1097 +TW4:という 0.1349 +TW4:ました 0.5543 +TW4:ようと -0.4258 +TW4:たが、 0.1516 +TW4:してい 0.2958 +TW4:たが, 0.1516 +TW4:ている 0.1538 +TW4:いう。 0.8576 +TW4:いう. 
0.8576 +TW4:よると 0.5865 +UC1:A 0.0484 +UC1:K 0.0093 +UC1:M 0.0645 +UC1:O -0.0505 +UC6:I -0.0253 +UC6:H -0.0506 +UC6:K 0.0087 +UC6:M 0.0247 +UC6:O -0.0387 +UW3:・ -0.3794 +UW3:調 -0.0562 +UW3:ン 0.0278 +UW3:ロ 0.2201 +UW3:ル 0.1591 +UW3:度 0.1452 +UW3:非 0.2066 +UW3:ム 0.1109 +UW3:府 0.1605 +UW3:ト 0.0521 +UW3:く 0.1004 +UW3:ッ -0.135 +UW3:広 -0.103 +UW3:李 0.3094 +UW3:部 0.12 +UW3:予 -0.1193 +UW3:郡 0.4404 +UW3:二 0.0974 +UW3:法 0.1868 +UW3:員 0.4513 +UW3:森 0.2438 +UW3:村 0.0364 +UW3:郎 0.1026 +UW3:グ 0.1319 +UW3:力 0.0365 +UW3:い 0.1006 +UW3:東 -0.0805 +UW3:ほ -0.5516 +UW3:へ 0.1199 +UW3:主 -0.0758 +UW3:ま -0.4384 +UW3:ひ -0.2171 +UW3:共 -0.188 +UW3:ふ -0.1798 +UW3:開 -0.1432 +UW3:六 0.0755 +UW3:公 -0.303 +UW3:中 0.0653 +UW3:と 0.1691 +UW3:は 0.4555 +UW3:全 0.1574 +UW3:間 0.1302 +UW3:っ -0.1444 +UW3:ち -0.0521 +UW3:型 0.1389 +UW3:で 0.2318 +UW3:て 0.6167 +UW3:両 0.3815 +UW3:つ -0.1081 +UW3:せ 0.3685 +UW3:す 0.0584 +UW3:た 0.0842 +UW3:そ -0.5228 +UW3:党 0.3593 +UW3:こ -0.3552 +UW3:げ 0.0401 +UW3:け 0.0388 +UW3:し -0.0395 +UW3:さ -0.1058 +UW3:ご -0.3116 +UW3:か -0.1163 +UW3:お -0.4864 +UW3:え 0.1983 +UW3:下 -0.1759 +UW3:が 0.3271 +UW3:あ -0.2696 +UW3:う 0.2342 +UW3:元 0.4858 +UW3:一 -0.1619 +UW3:政 -0.2013 +UW3:区 0.4646 +UW3:税 0.0401 +UW3:系 0.3066 +UW3:化 0.1327 +UW3:北 -0.1038 +UW3:口 0.0483 +UW3:右 0.1233 +UW3:駅 0.162 +UW3:戸 -0.0488 +UW3:知 -0.1528 +UW3:− -0.1723 +UW3:妻 0.2016 +UW3:金 0.2163 +UW3:込 -0.1504 +UW3:無 0.0979 +UW3:よ -0.0202 +UW3:わ -0.1207 +UW3:を 0.662 +UW3:学 -0.1356 +UW3:当 -0.3885 +UW3:保 -0.2439 +UW3:再 0.3095 +UW3:円 0.5807 +UW3:約 0.3663 +UW3:的 0.7313 +UW3:級 0.1384 +UW3:ア 0.0551 +UW3:ス 0.0874 +UW3:1 -0.08 +UW3:・ -0.3794 +UW3:ッ -0.135 +UW3:市 0.3197 +UW3:用 0.0914 +UW3:能 0.0725 +UW3:別 0.1129 +UW3:昨 -0.0661 +UW3:町 0.1215 +UW3:何 0.4265 +UW3:初 0.2475 +UW3:作 -0.0361 +UW3:決 -0.1073 +UW3:低 0.0811 +UW3:生 -0.0273 +UW3:月 0.4125 +UW3:数 0.3222 +UW3:最 -0.0937 +UW3:選 -0.0681 +UW3:雨 0.2009 +UW3:立 -0.096 +UW3:期 0.036 +UW3:電 -0.1045 +UW3:」 0.267 +UW3:費 0.1777 +UW3:業 0.0484 +UW3:, 0.4889 +UW3:者 0.6457 +UW3:教 -0.1479 +UW3:務 -0.1872 +UW3:動 -0.0949 
+UW3:財 -0.0733 +UW3:指 -0.3973 +UW3:車 0.1835 +UW3:軍 0.1375 +UW3:国 0.0642 +UW3:統 -0.4229 +UW3:直 -0.1835 +UW3:日 0.2099 +UW3:旧 0.5792 +UW3:千 -0.2309 +UW3:午 -0.0783 +UW3:協 -0.1006 +UW3:外 -0.0241 +UW3:建 -0.2352 +UW3:特 -0.385 +UW3:自 -0.2869 +UW3:物 0.0461 +UW3:平 -0.1804 +UW3:海 -0.0495 +UW3:人 0.2742 +UW3:〓 -0.3573 +UW3:」 0.267 +UW3:、 0.4889 +UW3:々 -0.2311 +UW3:長 0.0421 +UW3:〇 0.5827 +UW3:思 -0.1291 +UW3:安 -0.0423 +UW3:州 0.1155 +UW3:み -0.012 +UW3:実 -0.1008 +UW3:得 0.1905 +UW3:通 -0.1136 +UW3:性 0.1822 +UW3:同 0.3906 +UW3:合 -0.0241 +UW3:各 0.3588 +UW3:時 -0.1248 +UW3:ロ 0.2201 +UW3:ル 0.1591 +UW3:家 0.1078 +UW3:ン 0.0278 +UW3:ム 0.1109 +UW3:見 0.1044 +UW3:ト 0.0521 +UW3:新 0.1764 +UW3:に 0.2745 +UW3:な -0.2788 +UW3:文 -0.1489 +UW3:ど -0.0899 +UW3:米 0.7767 +UW3:の 0.4056 +UW3:も 0.2323 +UW3:め 0.1205 +UW3:や -0.0788 +UW3:り 0.0649 +UW3:る 0.5905 +UW3:氏 0.2613 +UW3:ら 0.0727 +UW3:今 0.0792 +UW3:核 0.5156 +UW3:れ 0.2773 +UW3:他 0.1889 +UW3:ん -0.0518 +UW3:民 -0.1694 +UW3:場 0.1219 +UW3:副 0.4437 +UW3:ア 0.0551 +UW3:分 0.0457 +UW3:以 -0.1368 +UW3:曜 -0.0951 +UW3:グ 0.1319 +UW3:年 0.2416 +UW3:和 -0.0837 +UW3:県 0.6293 +UW3:ス 0.0874 +UW3:前 0.2286 +UW3:1 -0.08 +UW3:総 0.1163 +UW3:少 -0.3102 +UW3:小 -0.0513 +UW3:線 0.1255 +UW3:第 0.1201 +UW3:関 -0.1282 +UW3:英 0.0785 +UW3:私 0.4231 +UW3:世 -0.2087 +UW3:省 0.0792 +UW2:行 0.0838 +UW2:最 -0.063 +UW2:調 0.101 +UW2:立 -0.0763 +UW2:朝 -0.1843 +UW2:本 -0.165 +UW2:, -0.0829 +UW2:ッ 0.0831 +UW2:事 0.0492 +UW2:目 -0.1584 +UW2:相 -0.0242 +UW2:人 -0.0123 +UW2:東 -0.0931 +UW2:べ 0.1261 +UW2:主 -0.0861 +UW2:ま 0.06 +UW2:太 -0.0483 +UW2:ひ -0.1273 +UW2:天 -0.0865 +UW2:強 0.1067 +UW2:開 0.1758 +UW2:に -0.1764 +UW2:な 0.1063 +UW2:ど 0.1273 +UW2:と -0.0981 +UW2:は -0.0409 +UW2:の 0.013 +UW2:間 -0.1257 +UW2:入 0.0548 +UW2:だ 0.1837 +UW2:で -0.0268 +UW2:て -0.0291 +UW2:つ -0.0949 +UW2:せ 0.03 +UW2:す -0.0675 +UW2:た 0.0188 +UW2:そ -0.1011 +UW2:こ 0.1141 +UW2:世 -0.0302 +UW2:し 0.1529 +UW2:ざ 0.054 +UW2:さ 0.0878 +UW2:か 0.1454 +UW2:お -0.0502 +UW2:不 -0.215 +UW2:く -0.0412 +UW2:三 -0.0758 +UW2:が -0.0856 +UW2:あ -0.0538 +UW2:う 0.0134 +UW2:い 0.0505 
+UW2:政 0.1522 +UW2:区 -0.0422 +UW2:自 -0.1353 +UW2:揺 -0.1033 +UW2:大 -0.1769 +UW2:理 0.0752 +UW2:「 -0.0645 +UW2:」 0.3145 +UW2:次 -0.2378 +UW2:、 -0.0829 +UW2:発 0.0529 +UW2:〇 0.0892 +UW2:実 0.1023 +UW2:西 -0.0744 +UW2:込 0.3041 +UW2:日 -0.1815 +UW2:見 -0.3874 +UW2:子 -0.1519 +UW2:新 -0.1682 +UW2:学 0.076 +UW2:保 0.0362 +UW2:文 -0.1355 +UW2:中 -0.0968 +UW2:手 -0.1519 +UW2:米 0.0509 +UW2:も -0.1263 +UW2:や -0.0402 +UW2:り -0.0579 +UW2:る -0.0694 +UW2:よ 0.1639 +UW2:れ 0.0571 +UW2:を -0.2516 +UW2:ん 0.2095 +UW2:気 -0.174 +UW2:民 -0.018 +UW2:副 -0.1566 +UW2:ア -0.0587 +UW2:ア -0.0587 +UW2:果 -0.0665 +UW2:キ 0.0568 +UW2:カ 0.0306 +UW2:カ 0.0306 +UW2:キ 0.0568 +UW2:」 0.3145 +UW2:「 -0.0645 +UW2:年 -0.106 +UW2:ッ 0.0831 +UW2:市 -0.0813 +UW2:議 0.1198 +UW2:小 -0.2009 +UW2:第 0.081 +UW2:初 -0.3025 +UW2:北 -0.3414 +UW2:明 -0.1462 +UW2:県 -0.1165 +UW2:会 0.0978 +TC4:IOO 0.0054 +TC4:HIH 0.0804 +TC4:HII 0.0679 +TC4:IIO 0.0656 +TC4:III 0.1497 +TC4:IIH 0.0321 +TC4:IHO -0.2324 +TC4:MOM 0.0841 +TC4:MHH -0.0405 +TC4:MHI 0.0201 +TC4:HOH 0.0446 +TC4:KAK 0.4845 +TC4:HHO 0.0669 +TC4:MMM 0.0661 +TC4:IHH 0.0695 +TC4:MMH -0.0241 +TC4:KKK 0.3065 +TC4:HHK 0.0365 +TC4:HHI 0.1344 +TC4:HHH -0.0203 +TC4:KKA 0.3386 +TC4:HHN 0.0182 +TC4:HHM -0.0122 +TQ3:BIIH -0.0116 +TQ3:BIII -0.0105 +TQ3:OKHH 0.0587 +TQ3:OIIH 0.1344 +TQ3:BHII -0.0504 +TQ3:BHIH 0.0222 +TQ3:OOHH 0.011 +TQ3:OKAK 0.2792 +TQ3:BHHH 0.0478 +TQ3:BOMH 0.062 +TQ3:BHHM -0.1073 +TQ3:OIHH 0.0623 +TQ3:BMHM -0.0464 +TQ3:OOII -0.0685 +TQ3:OKKA 0.0679 +TQ3:BMHI -0.0863 +TQ3:OHHI 0.1729 +TQ3:OHHH 0.0346 +TQ3:OHMH 0.0481 +TQ3:OHII 0.0997 +TC2:OII -0.2649 +TC2:HMM -0.1154 +TC2:IHI -0.1965 +TC2:KKH 0.0703 +TC2:HII -0.1023 +TC2:HHO 0.2088 +TC3:KOK -0.1009 +TC3:AAA -0.0294 +TC3:NNO 0.0662 +TC3:OHO -0.3393 +TC3:NNH -0.1689 +TC3:KHH -0.1216 +TC3:IOI -0.0542 +TC3:IIM -0.1035 +TC3:HII -0.1088 +TC3:HIK 0.0731 +TC3:IIH -0.0825 +TC3:IHO -0.1935 +TC3:MHO 0.0123 +TC3:MHM -0.0457 +TC3:MHH -0.2694 +TC3:HOH -0.1486 +TC3:KKH -0.1217 +TC3:IHH 0.0128 +TC3:IHI -0.3041 +TC3:MMH -0.0471 +TC3:HHI -0.0341 +TC3:HHH 0.0346 
+TC3:KKA 0.0491 +UW5:月 -0.4353 +UW5:ン -0.0343 +UW5:ル 0.0451 +UW5:挙 0.1618 +UW5:語 -0.1073 +UW5:, 0.0465 +UW5:者 -0.2233 +UW5:務 0.3519 +UW5:E2 -3.2768 +UW5:員 0.2104 +UW5:郎 -0.0368 +UW5:京 0.0722 +UW5:相 0.1319 +UW5:統 0.1955 +UW5:い 0.0331 +UW5:べ 0.1001 +UW5:み 0.0502 +UW5:大 -0.1296 +UW5:日 0.0218 +UW5:に -0.1224 +UW5:な -0.0787 +UW5:ど 0.1682 +UW5:と -0.0127 +UW5:は -0.0578 +UW5:の -0.0635 +UW5:間 0.1191 +UW5:っ 0.0052 +UW5:ち 0.1093 +UW5:だ -0.1186 +UW5:で -0.085 +UW5:て -0.0018 +UW5:つ 0.0921 +UW5:す -0.0852 +UW5:党 -0.0654 +UW5:研 -0.0997 +UW5:げ -0.0983 +UW5:し -0.1371 +UW5:空 -0.0813 +UW5:さ -0.1537 +UW5:か 0.0647 +UW5:お 0.0527 +UW5:え 0.1199 +UW5:く 0.0312 +UW5:ぎ 0.1971 +UW5:き 0.1624 +UW5:が -0.0421 +UW5:あ 0.1655 +UW5:う -0.0503 +UW5:E2 -3.2768 +UW5:表 0.0663 +UW5:区 -0.0901 +UW5:「 0.0363 +UW5:館 -0.0689 +UW5:、 0.0465 +UW5:。 -0.0299 +UW5:長 0.0786 +UW5:査 0.0932 +UW5:題 0.2368 +UW5:思 0.0872 +UW5:機 -0.1508 +UW5:定 0.1785 +UW5:. -0.0299 +UW5:格 0.1356 +UW5:氏 -0.1347 +UW5:ル 0.0451 +UW5:ン -0.0343 +UW5:社 -0.0278 +UW5:新 -0.1682 +UW5:学 -0.0548 +UW5:中 -0.0871 +UW5:所 -0.0814 +UW5:ゃ 0.335 +UW5:め 0.0865 +UW5:ょ 0.0854 +UW5:り -0.0208 +UW5:る 0.0429 +UW5:的 -0.3149 +UW5:わ 0.0419 +UW5:れ 0.0504 +UW5:を -0.1264 +UW5:ん 0.0327 +UW5:イ 0.0241 +UW5:イ 0.0241 +UW5:会 -0.1153 +UW5:嵐 -0.1304 +UW5:1 -0.0514 +UW5:「 0.0363 +UW5:年 0.1763 +UW5:1 -0.0514 +UW5:市 -0.2991 +UW5:議 0.1219 +UW5:田 0.024 +UW5:選 -0.1018 +UW5:町 -0.3912 +UW5:] -0.2762 +UW5:席 0.0921 +UW5:告 0.0848 +UW5:県 -0.4003 +UW5:省 -0.1052 +TC1:AAA 0.1093 +TC1:HOM -0.0331 +TC1:HOH -0.039 +TC1:OOI -0.1832 +TC1:IOM 0.0467 +TC1:IHI 0.1169 +TC1:MMH 0.0187 +TC1:IOI -0.1015 +TC1:IOH -0.0142 +TC1:HII 0.0998 +TC1:HHH 0.1029 +TC1:HHM 0.058 +UC4:A -0.2643 +UC4:I -0.1032 +UC4:H 0.1809 +UC4:K -0.345 +UC4:M 0.3565 +UC4:O 0.6646 +UC4:N 0.3876 +UQ2:OK 0.1759 +UQ2:BH 0.0216 +UQ2:BI 0.0113 +UW4:ー -1.187 +UW4:行 -0.0792 +UW4:規 0.0792 +UW4:・ -0.4371 +UW4:園 -0.12 +UW4:ン -0.3637 +UW4:ラ -0.0881 +UW4:ル -0.0856 +UW4:リ -0.0541 +UW4:メ -0.1635 +UW4:ぎ -0.3821 +UW4:地 0.0866 +UW4:ト -0.0403 +UW4:庁 -0.4556 
+UW4:ッ -0.0724 +UW4:率 0.0672 +UW4:予 0.0782 +UW4:事 -0.019 +UW4:井 -0.1768 +UW4:員 -0.091 +UW4:郎 -0.4866 +UW4:塁 -0.2094 +UW4:署 0.0749 +UW4:来 -0.0442 +UW4:力 -0.0302 +UW4:い -0.3435 +UW4:賞 0.073 +UW4:ほ 0.1464 +UW4:べ -0.0744 +UW4:へ 0.6665 +UW4:み -0.2082 +UW4:ま 0.1051 +UW4:び -0.4134 +UW4:ひ 0.4249 +UW4:ば 0.194 +UW4:共 -0.1212 +UW4:ふ 0.1345 +UW4:に 0.6499 +UW4:な 0.5433 +UW4:中 0.221 +UW4:と 0.4547 +UW4:は 0.8578 +UW4:の 0.7396 +UW4:ね 0.1413 +UW4:ぬ 0.1853 +UW4:っ -0.5882 +UW4:ち -0.3654 +UW4:だ 0.5408 +UW4:で 0.741 +UW4:て 0.3994 +UW4:つ -0.1659 +UW4:せ 0.0181 +UW4:ず 0.1251 +UW4:す -0.0731 +UW4:じ -0.2506 +UW4:た 0.5034 +UW4:そ 0.4091 +UW4:党 -0.2006 +UW4:こ 0.2255 +UW4:げ -0.4734 +UW4:け -0.4376 +UW4:し -0.0843 +UW4:さ 0.2864 +UW4:ご 0.1979 +UW4:か 0.053 +UW4:お 0.2405 +UW4:え -0.2514 +UW4:く -0.3788 +UW4:先 0.0601 +UW4:き -0.4482 +UW4:が 0.6006 +UW4:あ 0.4752 +UW4:う -0.064 +UW4:一 -0.2069 +UW4:島 -0.2056 +UW4:改 0.0787 +UW4:士 -0.1413 +UW4:政 0.2182 +UW4:区 0.4517 +UW4:野 -0.11 +UW4:支 0.0856 +UW4:系 0.0786 +UW4:館 -0.1984 +UW4:化 0.0776 +UW4:参 0.1555 +UW4:込 -0.337 +UW4:. 
0.3508 +UW4:よ 0.3351 +UW4:子 -0.4802 +UW4:学 -0.1397 +UW4:感 0.0916 +UW4:校 -0.036 +UW4:般 -0.0852 +UW4:内 0.0584 +UW4:円 0.0788 +UW4:題 -0.0792 +UW4:高 0.212 +UW4:約 0.2171 +UW4:的 0.2586 +UW4:銀 -0.2213 +UW4:屋 -0.1328 +UW4:済 -0.0543 +UW4:ー -1.187 +UW4:輪 -0.1433 +UW4:山 -0.15 +UW4:コ 0.1789 +UW4:セ 0.1287 +UW4:」 0.3798 +UW4:「 0.1895 +UW4:際 -0.2604 +UW4:・ -0.4371 +UW4:ッ -0.0724 +UW4:産 -0.1101 +UW4:市 0.2771 +UW4:能 -0.073 +UW4:田 -0.29 +UW4:選 0.2596 +UW4:町 0.1826 +UW4:間 -0.2344 +UW4:カ 0.2145 +UW4:体 -0.1286 +UW4:初 0.1347 +UW4:作 0.053 +UW4:カ 0.2145 +UW4:寺 -0.0809 +UW4:側 0.4292 +UW4:道 -0.1291 +UW4:生 -0.1286 +UW4:月 -0.9066 +UW4:都 0.1192 +UW4:最 0.0845 +UW4:立 -0.2112 +UW4:電 -0.0878 +UW4:沢 -0.0939 +UW4:業 -0.1043 +UW4:, 0.393 +UW4:者 0.2145 +UW4:教 0.0704 +UW4:務 -0.2715 +UW4:動 -0.074 +UW4:車 -0.1481 +UW4:回 0.15 +UW4:軍 0.1158 +UW4:経 0.1146 +UW4:国 -0.0619 +UW4:目 0.0922 +UW4:統 -0.1169 +UW4:大 0.0571 +UW4:日 0.1798 +UW4:谷 -0.1 +UW4:空 -0.0867 +UW4:協 0.1013 +UW4:多 0.1067 +UW4:領 -0.1659 +UW4:物 -0.0735 +UW4:人 0.1036 +UW4:〓 -0.5156 +UW4:球 -0.1267 +UW4:「 0.1895 +UW4:」 0.3798 +UW4:、 0.393 +UW4:。 0.3508 +UW4:長 0.0357 +UW4:〇 0.4999 +UW4:川 -0.2667 +UW4:定 -0.1057 +UW4:性 0.0553 +UW4:合 -0.1834 +UW4:後 0.0456 +UW4:時 0.1829 +UW4:首 0.1749 +UW4:ル -0.0856 +UW4:近 0.0929 +UW4:メ -0.1635 +UW4:ラ -0.0881 +UW4:方 -0.0856 +UW4:― -0.4841 +UW4:ト -0.0403 +UW4:文 0.0522 +UW4:所 -0.1566 +UW4:米 0.2937 +UW4:も 0.4169 +UW4:ゃ -0.2666 +UW4:む -0.0882 +UW4:め -0.5046 +UW4:ょ -0.1544 +UW4:や 0.2795 +UW4:院 -0.2297 +UW4:り -0.9726 +UW4:る -1.4896 +UW4:氏 0.5388 +UW4:ら -0.2922 +UW4:わ -0.1783 +UW4:れ -0.2613 +UW4:ろ -0.457 +UW4:を 1.315 +UW4:ん -0.2352 +UW4:気 -0.091 +UW4:民 -0.2716 +UW4:場 -0.141 +UW4:リ -0.0541 +UW4:副 0.3879 +UW4:以 0.0544 +UW4:会 0.095 +UW4:ン -0.3637 +UW4:コ 0.1789 +UW4:年 0.0374 +UW4:和 -0.0681 +UW4:セ 0.1287 +UW4:前 0.1623 +UW4:器 -0.0851 +UW4:総 0.094 +UW4:議 -0.0244 +UW4:小 0.191 +UW4:警 -0.1184 +UW4:線 -0.0994 +UW4:第 0.0788 +UW4:県 0.2997 +UW4:木 -0.0485 +UW4:省 -0.3485 +UQ3:ON -0.3212 +UQ3:BA -0.0479 +UQ3:OI -0.0827 +UQ3:BM 0.316 +UQ3:BN 0.6427 
+UQ3:BO 1.4761 +UQ3:BH 0.0042 +UQ3:BI 0.1913 +UQ3:BK -0.7198 +TQ1:OIHI 0.02 +TQ1:OIIH -0.0068 +TQ1:BIII 0.1595 +TQ1:OAKK 0.0482 +TQ1:BIHH 0.006 +TQ1:BHIH -0.0132 +TQ1:BHHH -0.0227 +TQ1:BHHI 0.0316 +TQ1:BOHH 0.0225 +TQ1:BOOO -0.0908 +TQ1:OHHH 0.0281 +TQ1:BNHH -0.0744 +TQ1:OHIH 0.0249 +UC5:I -0.1238 +UC5:H 0.0313 +UC5:K -0.0799 +UC5:M 0.0539 +UC5:O -0.0831 +TQ4:BIIH -0.0607 +TQ4:BIII -0.2181 +TQ4:OAKK 0.018 +TQ4:OIIH 0.0626 +TQ4:BHII -0.0966 +TQ4:OIHI -0.0493 +TQ4:BHHH -0.0721 +TQ4:OIII -0.4007 +TQ4:BHHM -0.3604 +TQ4:OIHH 0.1935 +TQ4:OHIH -0.1573 +TQ4:OKAK -0.8156 +TQ4:OHHI 0.2446 +TQ4:OHHH -0.0294 +TQ4:OAAA -0.2763 +TQ4:OHHO 0.048 +TW2:その後 -0.443 +TW2:社会党 -0.3216 +TW2:もので 0.1882 +TW2:ていた 0.1833 +TW2:大きな -0.1255 +TW2:ころが -0.2434 +TW2:同時に -0.8097 +TW2:一気に -0.0792 +TW2:ともに -0.4517 +TW2:だって -0.1049 +TW2:対して -0.2721 +TW2:として -0.4657 +TW2:いった -0.1256 +TW2:ある程 -0.2049 +TW2:初めて -0.1512 +TW2:しょう 0.3873 +TW1:東京都 0.2026 +TW1:につい -0.4681 +UW1:も -0.0466 +UW1:主 -0.0402 +UW1:大 0.0561 +UW1:や -0.047 +UW1:・ -0.0135 +UW1:り 0.0208 +UW1:日 -0.0141 +UW1:よ 0.0182 +UW1:ら -0.0292 +UW1:区 -0.0912 +UW1:れ 0.0169 +UW1:京 -0.0268 +UW1:に -0.0789 +UW1:ん -0.0137 +UW1:ど -0.0123 +UW1:と -0.0547 +UW1:は -0.0847 +UW1:の -0.0185 +UW1:都 -0.0718 +UW1:あ -0.0941 +UW1:市 -0.0411 +UW1:委 0.0729 +UW1:で -0.0201 +UW1:県 -0.0386 +UW1:を -0.0446 +UW1:国 -0.046 +UW1:・ -0.0135 +UW1:こ 0.0505 +UW1:理 0.0361 +UW1:午 0.0871 +UW1:, 0.0156 +UW1:「 -0.0463 +UW1:「 -0.0463 +UW1:き 0.0121 +UW1:が -0.0553 +UW1:、 0.0156 +UW1:う -0.0127 +UW1:生 -0.0408 +UP3:B 0.0189 +BP1:OO -0.0125 +BP1:OB 0.0304 +BP1:BB 0.0295 +BP1:UB 0.0352 +TW3:いただ -0.1734 +TW3:してい 0.1314 +TW3:十二月 -0.2287 +TW3:れから -0.3752 +TW3:のもの -0.06 +TW3:にとっ -0.5989 +TW3:に当た -0.6247 +TW3:ので、 -0.0727 +TW3:ので, -0.0727 +TW3:につい -0.5483 +TW3:として -0.4314 +BQ4:BMI -0.3385 +BQ4:OAH 0.0926 +BQ4:BOO -1.2396 +BQ4:OHH 0.0266 +BQ4:BHH -0.3895 +BQ4:ONN -0.0973 +BQ4:BIK 0.1348 +BQ4:BIH 0.3761 +BQ4:BII -0.4654 +BQ4:OHK -0.2036 +BQ4:BKK -0.1806 +BP2:OO -0.1762 +BP2:BO 0.006 +BQ2:BHI -0.1159 +BQ2:BHH 
0.0118 +BQ2:UHI -0.1146 +BQ2:BHM 0.0466 +BQ2:BIH -0.0919 +BQ2:OHM -0.0181 +BQ2:OHH -0.1139 +BQ2:BKO 0.0864 +BQ2:OIH 0.0153 +BQ2:BKK -0.172 +BQ3:BHI 0.2664 +BQ3:BHH -0.0792 +BQ3:OHM 0.0439 +BQ3:OHH 0.2174 +BQ3:OII 0.028 +BQ3:BII -0.0299 +BQ3:BMH 0.0937 +BQ3:OMH -0.2402 +BQ3:BKI 0.0419 +BQ3:BMM 0.8335 +BQ3:BOH 0.0775 +BQ3:BNN 0.0998 +BQ3:OKI -0.0793 +BQ3:OKH 0.1798 +BQ3:OOO 1.1699 +BQ3:OKO -0.2242 +TQ2:BIII -0.1033 +TQ2:BIHH -0.1401 +TQ2:BKAK -0.0543 +TQ2:BOOO -0.5591 +BQ1:BOH -0.0091 +BQ1:BNH 0.0449 +BQ1:BOO -0.2597 +BQ1:BHH 0.115 +BQ1:BIM 0.0886 +BQ1:BHM 0.1521 +BQ1:OHI 0.0451 +BQ1:BII -0.1158 +BQ1:BMH 0.1208 +BQ1:OIH -0.0296 +BQ1:OKA 0.1851 +BQ1:OKH -0.102 +BQ1:OKK 0.0904 +BQ1:OOO 0.2965 +UQ1:OO -0.2422 +UQ1:OK 0.041 +UQ1:OI 0.0477 +UQ1:OH -0.0095 +UQ1:BN 0.0142 +UQ1:BO -0.0056 +UQ1:BH 0.0021 +UQ1:BI -0.0012 +UQ1:BK -0.0099 +UC2:A 0.0819 +UC2:I 0.0409 +UC2:H 0.1059 +UC2:M 0.3987 +UC2:O 0.0646 +UC2:N 0.5775 +UP1:O -0.0214 +UP2:B 0.0069 +UP2:O 0.0935 +BC1:II 0.2461 +BC1:HH 0.0006 +BC1:KH 0.0406 +BC1:OH -0.1378 +BC2:AA -0.3267 +BC2:OO -0.292 +BC2:AI 0.2744 +BC2:KI 0.3831 +BC2:IK 0.1721 +BC2:MK 0.3334 +BC2:AN -0.0878 +BC2:II -0.1332 +BC2:IH -0.1184 +BC2:HH -0.407 +BC2:MH -0.3132 +BC2:HN 0.4012 +BC2:HO 0.3761 +BC2:IO 0.5492 +BC2:HM -0.1711 +BC2:IA 0.1327 +BC2:KK -0.8741 +BC3:HK -0.0721 +BC3:HH 0.0996 +BC3:HI 0.0626 +BC3:HN -0.1307 +BC3:HO -0.0836 +BC3:IH -0.0301 +BC3:KK 0.2762 +BC3:OH 0.0266 +BC3:OA -0.1652 +BC3:MM 0.4034 +BC3:MK 0.1079 diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..f1add26 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,80 @@ +max_width = 100 +hard_tabs = false +tab_spaces = 4 +newline_style = "Auto" +# indent_style = "Block" +use_small_heuristics = "Default" +fn_call_width = 80 +attr_fn_like_width = 70 +struct_lit_width = 18 +struct_variant_width = 35 +array_width = 80 +chain_width = 80 +single_line_if_else_max_width = 80 +single_line_let_else_max_width = 80 +# wrap_comments = false +# format_code_in_doc_comments = false +# 
doc_comment_code_block_width = 100 +# comment_width = 80 +# normalize_comments = false +# normalize_doc_attributes = false +# format_strings = false +# format_macro_matchers = false +# format_macro_bodies = true +# skip_macro_invocations = [] +# hex_literal_case = "Preserve" +# empty_item_single_line = true +# struct_lit_single_line = true +# fn_single_line = false +# where_single_line = false +# imports_indent = "Block" +# imports_layout = "Mixed" +# imports_granularity = "Preserve" +# group_imports = "Preserve" +reorder_imports = true +reorder_modules = true +# reorder_impl_items = false +# type_punctuation_density = "Wide" +# space_before_colon = false +# space_after_colon = true +# spaces_around_ranges = false +# binop_separator = "Front" +remove_nested_parens = true +# combine_control_expr = true +short_array_element_width_threshold = 10 +# overflow_delimited_expr = false +# struct_field_align_threshold = 0 +# enum_discrim_align_threshold = 0 +# match_arm_blocks = true +match_arm_leading_pipes = "Never" +# force_multiline_blocks = false +fn_params_layout = "Tall" +# brace_style = "SameLineWhere" +# control_brace_style = "AlwaysSameLine" +# trailing_semicolon = true +# trailing_comma = "Vertical" +match_block_trailing_comma = false +# blank_lines_upper_bound = 1 +# blank_lines_lower_bound = 0 +edition = "2015" +style_edition = "2015" +# version = "One" +# inline_attribute_width = 0 +# format_generated_files = true +# generated_marker_line_search_limit = 5 +merge_derives = true +use_try_shorthand = false +use_field_init_shorthand = false +force_explicit_abi = true +# condense_wildcard_suffixes = false +# color = "Auto" +# required_version = "1.8.0" +# unstable_features = false +disable_all_formatting = false +# skip_children = false +# show_parse_errors = true +# error_on_line_overflow = false +# error_on_unformatted = false +# ignore = [] +# emit_mode = "Files" +# make_backup = false diff --git a/src/adaboost.rs b/src/adaboost.rs index cfa1188..e0f9f63 100644 
--- a/src/adaboost.rs +++ b/src/adaboost.rs @@ -78,16 +78,10 @@ impl AdaBoost { } self.num_instances += 1; if self.num_instances % 1000 == 0 { - eprint!( - "\rfinding instances...: {} instances found", - self.num_instances - ); + eprint!("\rfinding instances...: {} instances found", self.num_instances); } } - eprintln!( - "\rfinding instances...: {} instances found", - self.num_instances - ); + eprintln!("\rfinding instances...: {} instances found", self.num_instances); map.insert("".to_string(), 0.0); self.features = map.keys().cloned().collect(); @@ -133,8 +127,7 @@ impl AdaBoost { let end = self.instances_buf.len(); self.instances.push((start, end)); - self.instance_weights - .push((-2.0 * label as f64 * score).exp()); + self.instance_weights.push((-2.0 * label as f64 * score).exp()); if self.instance_weights.len() % 1000 == 0 { eprint!( @@ -192,11 +185,7 @@ impl AdaBoost { } } - eprint!( - "\rIteration {} - margin: {}", - t, - (0.5 - best_error_rate).abs() - ); + eprint!("\rIteration {} - margin: {}", t, (0.5 - best_error_rate).abs()); if (0.5 - best_error_rate).abs() < self.threshold { break; } @@ -212,11 +201,7 @@ impl AdaBoost { let label = self.labels[i]; let (start, end) = self.instances[i]; let hs = &self.instances_buf[start..end]; - let prediction = if hs.binary_search(&h_best).is_ok() { - 1 - } else { - -1 - }; + let prediction = if hs.binary_search(&h_best).is_ok() { 1 } else { -1 }; if label * prediction < 0 { self.instance_weights[i] *= alpha_exp; } else { @@ -335,18 +320,10 @@ impl AdaBoost { let recall = pp as f64 / (pp + np).max(1) as f64 * 100.0; eprintln!("Result:"); - eprintln!( - "Accuracy: {:.2}% ({} / {})", - acc, - pp + nn, - self.num_instances - ); + eprintln!("Accuracy: {:.2}% ({} / {})", acc, pp + nn, self.num_instances); eprintln!("Precision: {:.2}% ({} / {})", prec, pp, pp + pn); eprintln!("Recall: {:.2}% ({} / {})", recall, pp, pp + np); - eprintln!( - "Confusion Matrix: TP: {}, FP: {}, FN: {}, TN: {}", - pp, pn, np, nn - ); + 
eprintln!("Confusion Matrix: TP: {}, FP: {}, FN: {}, TN: {}", pp, pn, np, nn); } /// Adds a new instance to the model. diff --git a/src/segmenter.rs b/src/segmenter.rs index ec613a8..e479fb6 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -19,15 +19,38 @@ impl Segmenter { /// A new Segmenter instance with the specified or default AdaBoost learner. pub fn new(learner: Option) -> Self { let patterns = vec![ - ( - Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), - "M", - ), + // Japanese Kanji numbers + (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), + // Japanese Kanji (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), + // Japanese Hiragana (Regex::new(r"[ぁ-ん]").unwrap(), "I"), + // Japanese Katakana (Regex::new(r"[ァ-ヴーア-ン゙ー]").unwrap(), "K"), + // Latin alphabet (ASCII + full-width) (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), + // Numbers (ASCII + full-width) (Regex::new(r"[0-90-9]").unwrap(), "N"), + // // Japanese Kanji numbers + // (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), + // // Japanese Kanji + // (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "J"), + // // Chinese Kanji (CJK Unified Ideographs) + // (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "M"), + // // Hangul (Korean) + // (Regex::new(r"[가-힣]").unwrap(), "K"), + // // Hiragana (Japanese) + // (Regex::new(r"[ぁ-ん]").unwrap(), "I"), + // // Katakana (Japanese) + // (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), + // // Latin alphabet (ASCII + full-width) + // (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), + // // Numbers (ASCII + full-width) + // (Regex::new(r"[0-90-9]").unwrap(), "N"), + // // Vietnamese Extended Latin + // (Regex::new(r"[À-ſ]").unwrap(), "V"), + // // Thai script + // (Regex::new(r"[ก-๛]").unwrap(), "T"), ]; Segmenter { @@ -279,9 +302,8 @@ mod tests { #[test] fn test_segmenter() { - let model_file = PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("./resources") - .join("RWCP.model"); + let model_file = + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./resources").join("RWCP.model"); let mut learner = 
AdaBoost::new(0.01, 100, 1); learner.load_model(model_file.as_path()).unwrap(); From 6ec56ca2e062fa60c40230c3a994e39fe82ee370 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Tue, 3 Jun 2025 22:04:24 +0900 Subject: [PATCH 05/15] Support multi languages (#5) * Support multi languages * Fix document --- src/segmenter.rs | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/src/segmenter.rs b/src/segmenter.rs index e479fb6..cf0e117 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -19,38 +19,26 @@ impl Segmenter { /// A new Segmenter instance with the specified or default AdaBoost learner. pub fn new(learner: Option) -> Self { let patterns = vec![ + // Numbers + (Regex::new(r"[0-90-9]").unwrap(), "N"), // Japanese Kanji numbers (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), - // Japanese Kanji - (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), - // Japanese Hiragana + // Hiragana (Japanese) (Regex::new(r"[ぁ-ん]").unwrap(), "I"), - // Japanese Katakana - (Regex::new(r"[ァ-ヴーア-ン゙ー]").unwrap(), "K"), - // Latin alphabet (ASCII + full-width) + // Katakana (Japanese) + (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), + // Hangul (Korean) + (Regex::new(r"[가-힣]").unwrap(), "G"), + // Thai script + (Regex::new(r"[ก-๛]").unwrap(), "T"), + // Kanji (Japanese) + (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), + // Kanji (CJK Unified Ideographs) + (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "Z"), + // Extended Latin (Vietnamese, etc.) 
+ (Regex::new(r"[À-ÿĀ-ſƀ-ƿǍ-ɏ]").unwrap(), "E"), + // ASCII + Full-width Latin (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), - // Numbers (ASCII + full-width) - (Regex::new(r"[0-90-9]").unwrap(), "N"), - // // Japanese Kanji numbers - // (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), - // // Japanese Kanji - // (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "J"), - // // Chinese Kanji (CJK Unified Ideographs) - // (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "M"), - // // Hangul (Korean) - // (Regex::new(r"[가-힣]").unwrap(), "K"), - // // Hiragana (Japanese) - // (Regex::new(r"[ぁ-ん]").unwrap(), "I"), - // // Katakana (Japanese) - // (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), - // // Latin alphabet (ASCII + full-width) - // (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), - // // Numbers (ASCII + full-width) - // (Regex::new(r"[0-90-9]").unwrap(), "N"), - // // Vietnamese Extended Latin - // (Regex::new(r"[À-ſ]").unwrap(), "V"), - // // Thai script - // (Regex::new(r"[ก-๛]").unwrap(), "T"), ]; Segmenter { @@ -65,7 +53,9 @@ impl Segmenter { /// * `ch` - A string slice representing a single character. /// /// # Returns - /// A static string representing the type of the character, such as "M", "H", "I", "K", "A", "N", or "O" (for others). + /// A string slice representing the type of the character, such as "N" for number, + /// "I" for Hiragana, "K" for Katakana, etc. If the character does not match any pattern, + /// it returns "O" for Other. 
pub fn get_type(&self, ch: &str) -> &str { for (pattern, label) in &self.patterns { if pattern.is_match(ch) { From ba89b7420ecc21a0180781bebd904d133c865f1c Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 00:11:03 +0900 Subject: [PATCH 06/15] Refactoring (#6) --- src/adaboost.rs | 136 ++++++++++++++++++++++++++++++------------------ src/main.rs | 32 ++++++++++-- src/trainer.rs | 7 ++- 3 files changed, 117 insertions(+), 58 deletions(-) diff --git a/src/adaboost.rs b/src/adaboost.rs index e0f9f63..2272343 100644 --- a/src/adaboost.rs +++ b/src/adaboost.rs @@ -7,6 +7,26 @@ use std::sync::Arc; type Label = i8; +/// Structure to hold evaluation metrics. +pub struct Metrics { + /// Accuracy in percentage (%) + pub accuracy: f64, + /// Precision in percentage (%) + pub precision: f64, + /// Recall in percentage (%) + pub recall: f64, + /// Number of instances in the dataset + pub num_instances: usize, + /// True Positives count + pub true_positives: usize, + /// False Positives count + pub false_positives: usize, + /// False Negatives count + pub false_negatives: usize, + /// True Negatives count + pub true_negatives: usize, +} + /// AdaBoost implementation for binary classification /// This implementation uses a simple feature extraction method /// and is designed for educational purposes. 
@@ -72,16 +92,20 @@ impl AdaBoost { let line = line?; let mut parts = line.split_whitespace(); let _label = parts.next(); + for h in parts { map.entry(h.to_string()).or_insert(0.0); buf_size += 1; } + self.num_instances += 1; if self.num_instances % 1000 == 0 { eprint!("\rfinding instances...: {} instances found", self.num_instances); } } + eprintln!("\rfinding instances...: {} instances found", self.num_instances); + map.insert("".to_string(), 0.0); self.features = map.keys().cloned().collect(); @@ -128,7 +152,6 @@ impl AdaBoost { let end = self.instances_buf.len(); self.instances.push((start, end)); self.instance_weights.push((-2.0 * label as f64 * score).exp()); - if self.instance_weights.len() % 1000 == 0 { eprint!( "\rloading instances...: {}/{} instances loaded", @@ -137,7 +160,13 @@ impl AdaBoost { ); } } - eprintln!(); + + eprintln!( + "\rloading instances...: {}/{} instances loaded", + self.instance_weights.len(), + self.num_instances + ); + Ok(()) } @@ -277,55 +306,6 @@ impl AdaBoost { Ok(()) } - /// Gets the bias term of the model. - /// The bias is calculated as the negative sum of the model weights divided by 2. - /// - /// # Returns:The bias term as a `f64`. - pub fn get_bias(&self) -> f64 { - -self.model.iter().sum::() / 2.0 - } - - /// Displays the result of the model's performance on the training data. - /// It calculates accuracy, precision, recall, and confusion matrix. 
- pub fn show_result(&self) { - let bias = self.get_bias(); - let mut pp = 0; - let mut pn = 0; - let mut np = 0; - let mut nn = 0; - - for i in 0..self.num_instances { - let label = self.labels[i]; - let (start, end) = self.instances[i]; - let mut score = bias; - for &h in &self.instances_buf[start..end] { - score += self.model[h]; - } - - if score >= 0.0 { - if label > 0 { - pp += 1 - } else { - pn += 1 - } - } else if label > 0 { - np += 1 - } else { - nn += 1 - } - } - - let acc = (pp + nn) as f64 / self.num_instances as f64 * 100.0; - let prec = pp as f64 / (pp + pn).max(1) as f64 * 100.0; - let recall = pp as f64 / (pp + np).max(1) as f64 * 100.0; - - eprintln!("Result:"); - eprintln!("Accuracy: {:.2}% ({} / {})", acc, pp + nn, self.num_instances); - eprintln!("Precision: {:.2}% ({} / {})", prec, pp, pp + pn); - eprintln!("Recall: {:.2}% ({} / {})", recall, pp, pp + np); - eprintln!("Confusion Matrix: TP: {}, FP: {}, FN: {}, TN: {}", pp, pn, np, nn); - } - /// Adds a new instance to the model. /// The instance is represented by a set of attributes and a label. /// @@ -372,4 +352,58 @@ impl AdaBoost { -1 } } + + /// Gets the bias term of the model. + /// The bias is calculated as the negative sum of the model weights divided by 2. + /// + /// # Returns:The bias term as a `f64`. + pub fn get_bias(&self) -> f64 { + -self.model.iter().sum::() / 2.0 + } + + /// Calculates and returns the performance metrics of the model on the training data. 
+ pub fn get_metrics(&self) -> Metrics { + let bias = self.get_bias(); + let mut true_positives = 0; // true positives + let mut false_positives = 0; // false positives + let mut false_negatives = 0; // false negatives + let mut true_negatives = 0; // true negatives + + for i in 0..self.num_instances { + let label = self.labels[i]; + let (start, end) = self.instances[i]; + let mut score = bias; + for &h in &self.instances_buf[start..end] { + score += self.model[h]; + } + if score >= 0.0 { + if label > 0 { + true_positives += 1; + } else { + false_positives += 1; + } + } else if label > 0 { + false_negatives += 1; + } else { + true_negatives += 1; + } + } + + let accuracy = (true_positives + true_negatives) as f64 / self.num_instances as f64 * 100.0; + let precision = + true_positives as f64 / (true_positives + false_positives).max(1) as f64 * 100.0; + let recall = + true_positives as f64 / (true_positives + false_negatives).max(1) as f64 * 100.0; + + Metrics { + accuracy, + precision, + recall, + num_instances: self.num_instances, + true_positives, + false_positives, + false_negatives, + true_negatives, + } + } } diff --git a/src/main.rs b/src/main.rs index e31f697..ff85dde 100644 --- a/src/main.rs +++ b/src/main.rs @@ -92,7 +92,7 @@ fn extract(args: ExtractArgs) -> Result<(), Box> { extractor.extract(args.corpus_file.as_path(), args.features_file.as_path())?; - println!("Feature extraction completed successfully."); + eprintln!("Feature extraction completed successfully."); Ok(()) } @@ -129,9 +129,35 @@ fn train(args: TrainArgs) -> Result<(), Box> { trainer.load_model(model_path.as_path())?; } - trainer.train(running, args.model_file.as_path())?; + let metrics = trainer.train(running, args.model_file.as_path())?; + + eprintln!("Result Metrics:"); + eprintln!( + " Accuracy: {:.2}% ( {} / {} )", + metrics.accuracy, + metrics.true_positives + metrics.true_negatives, + metrics.num_instances + ); + eprintln!( + " Precision: {:.2}% ( {} / {} )", + metrics.precision, + 
metrics.true_positives, + metrics.true_positives + metrics.false_positives + ); + eprintln!( + " Recall: {:.2}% ( {} / {} )", + metrics.recall, + metrics.true_positives, + metrics.true_positives + metrics.false_negatives + ); + eprintln!( + " Confusion Matrix:\n True Positives: {}\n False Positives: {}\n False Negatives: {}\n True Negatives: {}", + metrics.true_positives, + metrics.false_positives, + metrics.false_negatives, + metrics.true_negatives + ); - println!("Training completed successfully."); Ok(()) } diff --git a/src/trainer.rs b/src/trainer.rs index dd76a94..1bce18d 100644 --- a/src/trainer.rs +++ b/src/trainer.rs @@ -2,7 +2,7 @@ use std::path::Path; use std::sync::atomic::AtomicBool; use std::sync::Arc; -use crate::adaboost::AdaBoost; +use crate::adaboost::{AdaBoost, Metrics}; /// Trainer struct for managing the AdaBoost training process. /// It initializes the AdaBoost learner with the specified parameters, @@ -74,13 +74,12 @@ impl Trainer { &mut self, running: Arc, model_path: &Path, - ) -> Result<(), Box> { + ) -> Result> { self.learner.train(running.clone()); // Save the trained model to the specified file self.learner.save_model(model_path)?; - self.learner.show_result(); - Ok(()) + Ok(self.learner.get_metrics()) } } From 41438e854c9d3da5284e43682c2e53a5212eb6ce Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 17:08:45 +0900 Subject: [PATCH 07/15] Add tests (#7) --- src/segmenter.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/segmenter.rs b/src/segmenter.rs index cf0e117..871dde4 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -304,6 +304,11 @@ mod tests { let result = segmenter.parse(sentence); assert!(!result.is_empty()); assert_eq!(result.len(), 5); // Adjust based on expected segmentation + assert_eq!(result[0], "これ"); + assert_eq!(result[1], "は"); + assert_eq!(result[2], "テスト"); + assert_eq!(result[3], "です"); + assert_eq!(result[4], "。"); } #[test] From 7fc5c5c42344e2731c122e31d73d6d16f29311f3 Mon Sep 17 
00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 17:59:48 +0900 Subject: [PATCH 08/15] Rename function (#8) --- src/main.rs | 2 +- src/segmenter.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index ff85dde..06d9692 100644 --- a/src/main.rs +++ b/src/main.rs @@ -186,7 +186,7 @@ fn segment(args: SegmentArgs) -> Result<(), Box> { if line.is_empty() { continue; } - let tokens = segmenter.parse(line); + let tokens = segmenter.segment(line); writeln!(writer, "{}", tokens.join(" "))?; } diff --git a/src/segmenter.rs b/src/segmenter.rs index 871dde4..7d4b99d 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -160,14 +160,14 @@ impl Segmenter { } } - /// Parses a sentence and segments it into words. + /// Segments a sentence and segments it into words. /// /// # Arguments /// * `sentence` - A string slice representing the sentence to be parsed. /// /// # Returns /// A vector of strings, where each string is a segmented word from the sentence. 
- pub fn parse(&self, sentence: &str) -> Vec { + pub fn segment(&self, sentence: &str) -> Vec { if sentence.is_empty() { return Vec::new(); } @@ -301,7 +301,7 @@ mod tests { let mut segmenter = Segmenter::new(Some(learner)); let sentence = "これはテストです。"; segmenter.add_sentence(sentence); - let result = segmenter.parse(sentence); + let result = segmenter.segment(sentence); assert!(!result.is_empty()); assert_eq!(result.len(), 5); // Adjust based on expected segmentation assert_eq!(result[0], "これ"); From a55470806389aabd4de0298e2c9c3863b4d9de65 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Wed, 4 Jun 2025 21:11:43 +0900 Subject: [PATCH 09/15] Update segmenter --- src/segmenter.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/segmenter.rs b/src/segmenter.rs index 7d4b99d..be740c7 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -292,16 +292,18 @@ mod tests { #[test] fn test_segmenter() { + let sentence = "これはテストです。"; + let model_file = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./resources").join("RWCP.model"); - let mut learner = AdaBoost::new(0.01, 100, 1); learner.load_model(model_file.as_path()).unwrap(); let mut segmenter = Segmenter::new(Some(learner)); - let sentence = "これはテストです。"; + segmenter.add_sentence(sentence); let result = segmenter.segment(sentence); + assert!(!result.is_empty()); assert_eq!(result.len(), 5); // Adjust based on expected segmentation assert_eq!(result[0], "これ"); @@ -314,6 +316,7 @@ mod tests { #[test] fn test_get_type() { let segmenter = Segmenter::new(None); + assert_eq!(segmenter.get_type("あ"), "I"); // Hiragana assert_eq!(segmenter.get_type("漢"), "H"); // Kanji assert_eq!(segmenter.get_type("A"), "A"); // Latin From 1a8073f89f00038e115875cca7b91c206144dd9f Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 21:35:41 +0900 Subject: [PATCH 10/15] Add tests (#9) --- src/segmenter.rs | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git 
a/src/segmenter.rs b/src/segmenter.rs index be740c7..fb5af4a 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -290,6 +290,37 @@ mod tests { use super::*; + #[test] + fn test_add_sentence_with_writer() { + let mut segmenter = Segmenter::new(None); + let sentence = "テスト です"; + let mut collected = Vec::new(); + + segmenter.add_sentence_with_writer(sentence, |attrs, label| { + collected.push((attrs, label)); + }); + + // There should be as many instances as there are characters (excluding padding) + assert!(!collected.is_empty()); + + // Check that labels are either 1 or -1 + for (_, label) in &collected { + assert!(*label == 1 || *label == -1); + } + + // Check that attributes contain expected keys + let (attrs, _) = &collected[0]; + assert!(attrs.iter().any(|a| a.starts_with("UW"))); + assert!(attrs.iter().any(|a| a.starts_with("UC"))); + } + + #[test] + fn test_add_sentence_empty() { + let mut segmenter = Segmenter::new(None); + segmenter.add_sentence(""); + // Should not panic or add anything + } + #[test] fn test_segmenter() { let sentence = "これはテストです。"; @@ -313,6 +344,13 @@ mod tests { assert_eq!(result[4], "。"); } + #[test] + fn test_segment_empty_sentence() { + let segmenter = Segmenter::new(None); + let result = segmenter.segment(""); + assert!(result.is_empty()); + } + #[test] fn test_get_type() { let segmenter = Segmenter::new(None); @@ -321,5 +359,38 @@ mod tests { assert_eq!(segmenter.get_type("漢"), "H"); // Kanji assert_eq!(segmenter.get_type("A"), "A"); // Latin assert_eq!(segmenter.get_type("1"), "N"); // Digit + assert_eq!(segmenter.get_type("@"), "O"); // Not matching any pattern + } + + #[test] + fn test_get_attributes_content() { + let segmenter = Segmenter::new(None); + + let tags = vec!["U".to_string(); 7]; + + let chars = vec![ + "B3".to_string(), // index 0 + "B2".to_string(), // index 1 + "B1".to_string(), // index 2 + "あ".to_string(), // index 3 + "い".to_string(), // index 4 + "う".to_string(), // index 5 + "E1".to_string(), // index 6 + 
]; + + let types = vec![ + "O".to_string(), // index 0 + "O".to_string(), // index 1 + "O".to_string(), // index 2 + "O".to_string(), // index 3 + "I".to_string(), // index 4 + "I".to_string(), // index 5 + "O".to_string(), // index 6 + ]; + + let attrs = segmenter.get_attributes(4, &tags, &chars, &types); + assert!(attrs.contains("UW4:い")); + assert!(attrs.contains("UC4:I")); + assert!(attrs.contains("UP3:U")); } } From f734991c7e6e3766a5a47b2f95b6cbb7398ccd7b Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 21:46:28 +0900 Subject: [PATCH 11/15] Add test (#10) --- Cargo.lock | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ src/extractor.rs | 38 ++++++++++++++++++++ 3 files changed, 132 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index a878421..4594d15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,6 +166,34 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "errno" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + [[package]] name = "heck" version = "0.5.0" @@ -190,6 +218,12 @@ version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +[[package]] +name = 
"linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "litsea" version = "0.1.0" @@ -200,6 +234,7 @@ dependencies = [ "regex", "serde", "serde_json", + "tempfile", ] [[package]] @@ -220,6 +255,12 @@ dependencies = [ "libc", ] +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + [[package]] name = "once_cell_polyfill" version = "1.70.1" @@ -244,6 +285,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rayon" version = "1.10.0" @@ -293,6 +340,19 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rustix" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "ryu" version = "1.0.20" @@ -348,6 +408,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "unicode-ident" version = "1.0.18" @@ -360,6 +433,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -432,3 +514,12 @@ name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] diff --git a/Cargo.toml b/Cargo.toml index ad4ee53..1083139 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,6 @@ rayon = "1.10.0" regex = "1.10.5" serde = { version = "1.0.219", features = ["derive"] } serde_json = "1.0.140" + +[dev-dependencies] +tempfile = "3.20.0" diff --git a/src/extractor.rs b/src/extractor.rs index 6efa325..fbb7b92 100644 --- a/src/extractor.rs +++ b/src/extractor.rs @@ -84,3 +84,41 @@ impl Extractor { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + use std::fs::File; + use std::io::{Read, Write}; + + use tempfile::NamedTempFile; + + #[test] + fn test_extract() -> Result<(), Box> { + // Create a temporary file to simulate the corpus input + let mut corpus_file = NamedTempFile::new()?; + writeln!(corpus_file, "これ は テスト です 。")?; + writeln!(corpus_file, "別 の 文 も あり ます 。")?; + corpus_file.as_file().sync_all()?; + + // Create a temporary file for the features output + let features_file = NamedTempFile::new()?; + + // Create an instance of Extractor and extract features + let mut extractor = Extractor::new(); + extractor.extract(corpus_file.path(), features_file.path())?; + + // Read the output from the features file + let mut 
output = String::new(); + File::open(features_file.path())?.read_to_string(&mut output)?; + + // Check if the output is not empty + assert!(!output.is_empty(), "Extracted features should not be empty"); + + // Check if the output contains tab-separated values + assert!(output.contains("\t"), "Output should contain tab-separated values"); + + Ok(()) + } +} From 094909fa4f2261158d6d2b86de63f0c861517da3 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Wed, 4 Jun 2025 22:01:38 +0900 Subject: [PATCH 12/15] Tests (#11) * Add test * Add test * Delete unused model --- resources/RWCP.model.bak | 1341 -------------------------------------- src/trainer.rs | 76 +++ 2 files changed, 76 insertions(+), 1341 deletions(-) delete mode 100644 resources/RWCP.model.bak diff --git a/resources/RWCP.model.bak b/resources/RWCP.model.bak deleted file mode 100644 index eef6ffa..0000000 --- a/resources/RWCP.model.bak +++ /dev/null @@ -1,1341 +0,0 @@ --0.0332 -UW6:ン -0.0496 -UW6:連 0.0463 -UW6:ル -0.0673 -UW6:か 0.0241 -UW6:業 -0.0697 -UW6:, 0.0227 -UW6:者 0.1811 -UW6:. 
0.0808 -UW6:福 0.0974 -UW6:後 0.0535 -UW6:広 -0.0695 -UW6:ル -0.0673 -UW6:E1 0.0306 -UW6:ン -0.0496 -UW6:員 -0.1212 -UW6:に -0.0149 -UW6:学 -0.096 -UW6:郎 0.1082 -UW6:E1 0.0306 -UW6:相 0.0753 -UW6:も -0.0206 -UW6:り 0.0187 -UW6:る -0.0135 -UW6:社 -0.0507 -UW6:を 0.0195 -UW6:な -0.0253 -UW6:中 0.0201 -UW6:と -0.0105 -UW6:は -0.0236 -UW6:の -0.0417 -UW6:っ 0.0573 -UW6:で 0.0101 -UW6:て -0.1014 -UW6:す 0.0383 -UW6:じ 0.1782 -UW6:た -0.0428 -UW6:こ -0.02 -UW6:会 0.0624 -UW6:空 -0.0822 -UW6:件 -0.08 -UW6:く -0.0121 -UW6:が -0.0073 -UW6:あ -0.0307 -UW6:前 0.0302 -UW6:う 0.0189 -UW6:一 -0.0277 -UW6:1 -0.027 -UW6:市 0.0887 -UW6:委 0.0798 -UW6:区 0.1792 -UW6:1 -0.027 -UW6:、 0.0227 -UW6:。 0.0808 -BW1:には 0.1498 -BW1:B1あ 0.1404 -BW1:ため 0.0601 -BW1:大阪 0.1497 -BW1:に対 -0.0912 -BW1:引き -0.1336 -BW1:から 0.3472 -BW1:れた 0.2369 -BW1:うん 0.0665 -BW1:,同 0.0727 -BW1:毎日 -0.2113 -BW1:やむ -0.1947 -BW1:です 0.3445 -BW1:まで 0.1711 -BW1:いる 0.0672 -BW1:、同 0.0727 -BW1:」と 0.1682 -BW1:、と 0.066 -BW1:よっ -0.2565 -BW1:なっ 0.3015 -BW1:日本 -0.0195 -BW1:すで -0.3399 -BW1:平方 -0.2314 -BW1:れで -0.0913 -BW1:とい -0.4915 -BW1:ませ 0.2448 -BW1:つい -0.0802 -BW1:を見 0.0731 -BW1:てき 0.1249 -BW1:それ -0.0871 -BW1:こん -0.1262 -BW1:でき 0.1127 -BW1:をし 0.186 -BW1:てい 0.0805 -BW1:大き -0.2604 -BW1:B1あ 0.1404 -BW1:の中 0.0741 -BW1:して 0.1104 -BW1:では 0.0844 -BW1:さら -0.4143 -BW1:どこ 0.3887 -BW1:京都 0.2558 -BW1:いう 0.1743 -BW1:うし -0.4817 -BW1:ない 0.5713 -BW1:にし 0.2468 -BW1:まま 0.26 -BW1:あっ 0.1505 -BW1:の一 -0.0501 -BW1:亡く -0.1886 -BW1:こと 0.2083 -BW1:った 0.3463 -BW1:さん 0.4573 -BW1:にも 0.1671 -BW1:なん -0.1113 -BW1:B1同 0.0542 -BW1:とみ 0.1922 -BW1:」と 0.1682 -BW1:いっ -0.2055 -BW1:たち 0.1122 -BW1:本当 -0.2423 -BW1:,と 0.066 -BW1:がら 0.06 -BW1:こう -0.079 -BW1:取り -0.2784 -BW1:目指 -0.0724 -BW1:した 0.2641 -BW1:B1同 0.0542 -BW1:そこ 0.1977 -BW1:まる -0.2155 -BW1:など 0.7379 -BW2:との 0.072 -BW2:われ 0.7901 -BW2:とと -0.2279 -BW2:新聞 -0.4066 -BW2:を通 -1.1877 -BW2:とみ 0.5168 -BW2:朝鮮 -0.2355 -BW2:大阪 -0.2471 -BW2:同党 0.097 -BW2:とい 0.189 -BW2:がい 0.0853 -BW2:とこ -0.1746 -BW2:11 -0.0669 -BW2:米国 -0.4268 -BW2:れて 0.0849 -BW2:日新 -0.0722 -BW2:れば 
0.4114 -BW2:うか 0.249 -BW2:くな -0.1597 -BW2:に関 -1.1388 -BW2:こと -0.8392 -BW2:かし -0.135 -BW2:この -0.4193 -BW2:なん 0.3099 -BW2:府県 -0.2363 -BW2:にし 0.2748 -BW2:会社 -0.1116 -BW2:同日 -0.0913 -BW2:れた 0.427 -BW2:かも -0.0602 -BW2:にな 0.2454 -BW2:然と -0.1384 -BW2:に対 -1.4943 -BW2:から -0.7194 -BW2:上が -0.4479 -BW2:かれ 0.4612 -BW2:んな -0.4115 -BW2:ては -0.311 -BW2:めて -0.3153 -BW2:んだ 0.0728 -BW2:らか -0.0944 -BW2:一方 -0.1375 -BW2:にお -0.1615 -BW2:分の -0.7758 -BW2:まし -0.1316 -BW2:てき 0.364 -BW2:てく 0.2551 -BW2:てい 0.6144 -BW2:らに -0.1897 -BW2:手権 -0.1982 -BW2:一日 0.097 -BW2:りし 0.0651 -BW2:はず -0.2532 -BW2:いう -0.1609 -BW2:少な -0.105 -BW2:はが -0.1033 -BW2:はい 0.1073 -BW2:ろう 0.6067 -BW2:でも -0.4203 -BW2:りま 0.162 -BW2:日本 -0.7068 -BW2:社会 -0.1276 -BW2:らし -0.1611 -BW2:もの -1.0713 -BW2:させ 0.4533 -BW2:.. -1.1822 -BW2:に従 -0.4688 -BW2:東京 -0.1543 -BW2:もい 0.223 -BW2:され 1.3168 -BW2:その -0.3744 -BW2:たい -0.1253 -BW2:たた -0.0662 -BW2:一部 -0.1051 -BW2:ばれ 0.1813 -BW2:委員 -0.125 -BW2:った 0.4589 -BW2:さん -0.3977 -BW2:たは -0.0939 -BW2:一人 0.0602 -BW2:って 0.1647 -BW2:たと 0.1224 -BW2:っと -0.2094 -BW2:たち -0.0786 -BW2:ただ -0.3857 -BW2:立て -0.099 -BW2:まれ 0.5409 -BW2:出て 0.2163 -BW2:ても -0.3065 -BW2:まで -0.6621 -BW2:11 -0.0669 -BW2:年度 -0.8669 -BW2:なの 0.2614 -BW2:など -0.6509 -BW2:でし -0.3828 -BW2:第に -0.1612 -BW2:曜日 -0.0601 -BW2:です -0.4761 -BW2:なが -0.1313 -BW2:でい 0.2666 -BW2:ない -0.2488 -BW2:でき -0.1528 -BW2:して 0.0972 -BW2:きた 0.1941 -BW2:ので -0.7059 -BW2:のに -0.6041 -BW2:しな 0.0939 -BW2:のの -0.6125 -BW2:本人 -0.2697 -BW2:―― -0.573 -BW2:しい -0.1819 -BW2:によ -0.7236 -BW2:のか 0.2093 -BW2:しか -0.0545 -BW2:年間 -0.1626 -BW2:がら -0.3198 -BW2:とも -0.3941 -BW2:−− -1.3175 -BW2:した 0.5078 -BW2:日米 0.3372 -BW3:との 0.0541 -BW3:われ -0.0605 -BW3:だ。 0.4098 -BW3:す. 
-0.131 -BW3:い。 -0.1185 -BW3:か。 0.2857 -BW3:がっ -0.0913 -BW3:がけ -0.1127 -BW3:とう -0.1387 -BW3:ず、 0.3426 -BW3:新聞 -0.5055 -BW3:日, 0.0974 -BW3:とし 0.2266 -BW3:会議 0.086 -BW3:がき -0.4855 -BW3:れて 0.1375 -BW3:する 0.6521 -BW3:ず, 0.3426 -BW3:に, -0.1021 -BW3:られ 0.682 -BW3:かけ -0.0743 -BW3:こと 0.7397 -BW3:この 0.1542 -BW3:かに -0.0669 -BW3:かっ -0.4098 -BW3:てい 0.624 -BW3:が、 0.1816 -BW3:うち 0.1117 -BW3:れた 0.185 -BW3:にな 0.1906 -BW3:には 0.2644 -BW3:かり -0.267 -BW3:から 0.652 -BW3:は、 0.1337 -BW3:まっ -0.1549 -BW3:まで 0.6154 -BW3:んで 0.0798 -BW3:んだ 0.0606 -BW3:うと 0.4798 -BW3:ころ -0.2757 -BW3:ます 0.6943 -BW3:てお 0.0855 -BW3:入り 0.1232 -BW3:にし 0.1771 -BW3:いえ 0.2079 -BW3:す。 -0.131 -BW3:いく 0.3029 -BW3:ずに 0.0841 -BW3:いい 0.5308 -BW3:るる 0.3818 -BW3:れ、 0.0854 -BW3:いた 0.2056 -BW3:の子 -0.1 -BW3:だっ 0.1004 -BW3:いっ 0.1883 -BW3:カ月 0.099 -BW3:か. 0.2857 -BW3:けど 0.1374 -BW3:た。 0.8875 -BW3:社会 0.2024 -BW3:さい -0.0714 -BW3:らし 0.1479 -BW3:い. -0.1185 -BW3:始め 0.1681 -BW3:の、 -0.0724 -BW3:が, 0.1816 -BW3:たい -0.0594 -BW3:った -0.4748 -BW3:さを 0.0976 -BW3:たの 0.0812 -BW3:日、 0.0974 -BW3:って 0.03 -BW3:べき 0.2181 -BW3:の, -0.0724 -BW3:に、 -0.1021 -BW3:そう 0.0428 -BW3:カ月 0.099 -BW3:まれ -0.0793 -BW3:ても 0.0302 -BW3:大会 0.2217 -BW3:たり -0.1183 -BW3:たる -0.0853 -BW3:では 0.2295 -BW3:など 0.2135 -BW3:いる 0.56 -BW3:し、 0.1557 -BW3:いわ 0.1527 -BW3:た. 0.8875 -BW3:ある 0.3846 -BW3:あり 0.0719 -BW3:れる 0.1091 -BW3:でに -0.1482 -BW3:は, 0.1337 -BW3:です 0.1437 -BW3:なく -0.0903 -BW3:ない 0.1796 -BW3:して 0.1449 -BW3:市 0.0965 -BW3:きた 0.1645 -BW3:しな 0.2608 -BW3:れ, 0.0854 -BW3:どう 0.4664 -BW3:しま 0.12 -BW3:まし 0.1113 -BW3:だ. 0.4098 -BW3:しい -0.3714 -BW3:し, 0.1557 -BW3:えと 0.1454 -BW3:れば -0.3246 -BW3:あた -0.2194 -BW3:がり -0.2064 -BW3:がら -0.4977 -BW3:とも -0.3543 -BW3:した 0.3562 -UC3:A -0.137 -UC3:I 0.2311 -TW4:からな -0.2348 -TW4:ません 0.1097 -TW4:という 0.1349 -TW4:ました 0.5543 -TW4:ようと -0.4258 -TW4:たが、 0.1516 -TW4:してい 0.2958 -TW4:たが, 0.1516 -TW4:ている 0.1538 -TW4:いう。 0.8576 -TW4:いう. 
0.8576 -TW4:よると 0.5865 -UC1:A 0.0484 -UC1:K 0.0093 -UC1:M 0.0645 -UC1:O -0.0505 -UC6:I -0.0253 -UC6:H -0.0506 -UC6:K 0.0087 -UC6:M 0.0247 -UC6:O -0.0387 -UW3:・ -0.3794 -UW3:調 -0.0562 -UW3:ン 0.0278 -UW3:ロ 0.2201 -UW3:ル 0.1591 -UW3:度 0.1452 -UW3:非 0.2066 -UW3:ム 0.1109 -UW3:府 0.1605 -UW3:ト 0.0521 -UW3:く 0.1004 -UW3:ッ -0.135 -UW3:広 -0.103 -UW3:李 0.3094 -UW3:部 0.12 -UW3:予 -0.1193 -UW3:郡 0.4404 -UW3:二 0.0974 -UW3:法 0.1868 -UW3:員 0.4513 -UW3:森 0.2438 -UW3:村 0.0364 -UW3:郎 0.1026 -UW3:グ 0.1319 -UW3:力 0.0365 -UW3:い 0.1006 -UW3:東 -0.0805 -UW3:ほ -0.5516 -UW3:へ 0.1199 -UW3:主 -0.0758 -UW3:ま -0.4384 -UW3:ひ -0.2171 -UW3:共 -0.188 -UW3:ふ -0.1798 -UW3:開 -0.1432 -UW3:六 0.0755 -UW3:公 -0.303 -UW3:中 0.0653 -UW3:と 0.1691 -UW3:は 0.4555 -UW3:全 0.1574 -UW3:間 0.1302 -UW3:っ -0.1444 -UW3:ち -0.0521 -UW3:型 0.1389 -UW3:で 0.2318 -UW3:て 0.6167 -UW3:両 0.3815 -UW3:つ -0.1081 -UW3:せ 0.3685 -UW3:す 0.0584 -UW3:た 0.0842 -UW3:そ -0.5228 -UW3:党 0.3593 -UW3:こ -0.3552 -UW3:げ 0.0401 -UW3:け 0.0388 -UW3:し -0.0395 -UW3:さ -0.1058 -UW3:ご -0.3116 -UW3:か -0.1163 -UW3:お -0.4864 -UW3:え 0.1983 -UW3:下 -0.1759 -UW3:が 0.3271 -UW3:あ -0.2696 -UW3:う 0.2342 -UW3:元 0.4858 -UW3:一 -0.1619 -UW3:政 -0.2013 -UW3:区 0.4646 -UW3:税 0.0401 -UW3:系 0.3066 -UW3:化 0.1327 -UW3:北 -0.1038 -UW3:口 0.0483 -UW3:右 0.1233 -UW3:駅 0.162 -UW3:戸 -0.0488 -UW3:知 -0.1528 -UW3:− -0.1723 -UW3:妻 0.2016 -UW3:金 0.2163 -UW3:込 -0.1504 -UW3:無 0.0979 -UW3:よ -0.0202 -UW3:わ -0.1207 -UW3:を 0.662 -UW3:学 -0.1356 -UW3:当 -0.3885 -UW3:保 -0.2439 -UW3:再 0.3095 -UW3:円 0.5807 -UW3:約 0.3663 -UW3:的 0.7313 -UW3:級 0.1384 -UW3:ア 0.0551 -UW3:ス 0.0874 -UW3:1 -0.08 -UW3:・ -0.3794 -UW3:ッ -0.135 -UW3:市 0.3197 -UW3:用 0.0914 -UW3:能 0.0725 -UW3:別 0.1129 -UW3:昨 -0.0661 -UW3:町 0.1215 -UW3:何 0.4265 -UW3:初 0.2475 -UW3:作 -0.0361 -UW3:決 -0.1073 -UW3:低 0.0811 -UW3:生 -0.0273 -UW3:月 0.4125 -UW3:数 0.3222 -UW3:最 -0.0937 -UW3:選 -0.0681 -UW3:雨 0.2009 -UW3:立 -0.096 -UW3:期 0.036 -UW3:電 -0.1045 -UW3:」 0.267 -UW3:費 0.1777 -UW3:業 0.0484 -UW3:, 0.4889 -UW3:者 0.6457 -UW3:教 -0.1479 -UW3:務 -0.1872 -UW3:動 -0.0949 
-UW3:財 -0.0733 -UW3:指 -0.3973 -UW3:車 0.1835 -UW3:軍 0.1375 -UW3:国 0.0642 -UW3:統 -0.4229 -UW3:直 -0.1835 -UW3:日 0.2099 -UW3:旧 0.5792 -UW3:千 -0.2309 -UW3:午 -0.0783 -UW3:協 -0.1006 -UW3:外 -0.0241 -UW3:建 -0.2352 -UW3:特 -0.385 -UW3:自 -0.2869 -UW3:物 0.0461 -UW3:平 -0.1804 -UW3:海 -0.0495 -UW3:人 0.2742 -UW3:〓 -0.3573 -UW3:」 0.267 -UW3:、 0.4889 -UW3:々 -0.2311 -UW3:長 0.0421 -UW3:〇 0.5827 -UW3:思 -0.1291 -UW3:安 -0.0423 -UW3:州 0.1155 -UW3:み -0.012 -UW3:実 -0.1008 -UW3:得 0.1905 -UW3:通 -0.1136 -UW3:性 0.1822 -UW3:同 0.3906 -UW3:合 -0.0241 -UW3:各 0.3588 -UW3:時 -0.1248 -UW3:ロ 0.2201 -UW3:ル 0.1591 -UW3:家 0.1078 -UW3:ン 0.0278 -UW3:ム 0.1109 -UW3:見 0.1044 -UW3:ト 0.0521 -UW3:新 0.1764 -UW3:に 0.2745 -UW3:な -0.2788 -UW3:文 -0.1489 -UW3:ど -0.0899 -UW3:米 0.7767 -UW3:の 0.4056 -UW3:も 0.2323 -UW3:め 0.1205 -UW3:や -0.0788 -UW3:り 0.0649 -UW3:る 0.5905 -UW3:氏 0.2613 -UW3:ら 0.0727 -UW3:今 0.0792 -UW3:核 0.5156 -UW3:れ 0.2773 -UW3:他 0.1889 -UW3:ん -0.0518 -UW3:民 -0.1694 -UW3:場 0.1219 -UW3:副 0.4437 -UW3:ア 0.0551 -UW3:分 0.0457 -UW3:以 -0.1368 -UW3:曜 -0.0951 -UW3:グ 0.1319 -UW3:年 0.2416 -UW3:和 -0.0837 -UW3:県 0.6293 -UW3:ス 0.0874 -UW3:前 0.2286 -UW3:1 -0.08 -UW3:総 0.1163 -UW3:少 -0.3102 -UW3:小 -0.0513 -UW3:線 0.1255 -UW3:第 0.1201 -UW3:関 -0.1282 -UW3:英 0.0785 -UW3:私 0.4231 -UW3:世 -0.2087 -UW3:省 0.0792 -UW2:行 0.0838 -UW2:最 -0.063 -UW2:調 0.101 -UW2:立 -0.0763 -UW2:朝 -0.1843 -UW2:本 -0.165 -UW2:, -0.0829 -UW2:ッ 0.0831 -UW2:事 0.0492 -UW2:目 -0.1584 -UW2:相 -0.0242 -UW2:人 -0.0123 -UW2:東 -0.0931 -UW2:べ 0.1261 -UW2:主 -0.0861 -UW2:ま 0.06 -UW2:太 -0.0483 -UW2:ひ -0.1273 -UW2:天 -0.0865 -UW2:強 0.1067 -UW2:開 0.1758 -UW2:に -0.1764 -UW2:な 0.1063 -UW2:ど 0.1273 -UW2:と -0.0981 -UW2:は -0.0409 -UW2:の 0.013 -UW2:間 -0.1257 -UW2:入 0.0548 -UW2:だ 0.1837 -UW2:で -0.0268 -UW2:て -0.0291 -UW2:つ -0.0949 -UW2:せ 0.03 -UW2:す -0.0675 -UW2:た 0.0188 -UW2:そ -0.1011 -UW2:こ 0.1141 -UW2:世 -0.0302 -UW2:し 0.1529 -UW2:ざ 0.054 -UW2:さ 0.0878 -UW2:か 0.1454 -UW2:お -0.0502 -UW2:不 -0.215 -UW2:く -0.0412 -UW2:三 -0.0758 -UW2:が -0.0856 -UW2:あ -0.0538 -UW2:う 0.0134 -UW2:い 0.0505 
-UW2:政 0.1522 -UW2:区 -0.0422 -UW2:自 -0.1353 -UW2:揺 -0.1033 -UW2:大 -0.1769 -UW2:理 0.0752 -UW2:「 -0.0645 -UW2:」 0.3145 -UW2:次 -0.2378 -UW2:、 -0.0829 -UW2:発 0.0529 -UW2:〇 0.0892 -UW2:実 0.1023 -UW2:西 -0.0744 -UW2:込 0.3041 -UW2:日 -0.1815 -UW2:見 -0.3874 -UW2:子 -0.1519 -UW2:新 -0.1682 -UW2:学 0.076 -UW2:保 0.0362 -UW2:文 -0.1355 -UW2:中 -0.0968 -UW2:手 -0.1519 -UW2:米 0.0509 -UW2:も -0.1263 -UW2:や -0.0402 -UW2:り -0.0579 -UW2:る -0.0694 -UW2:よ 0.1639 -UW2:れ 0.0571 -UW2:を -0.2516 -UW2:ん 0.2095 -UW2:気 -0.174 -UW2:民 -0.018 -UW2:副 -0.1566 -UW2:ア -0.0587 -UW2:ア -0.0587 -UW2:果 -0.0665 -UW2:キ 0.0568 -UW2:カ 0.0306 -UW2:カ 0.0306 -UW2:キ 0.0568 -UW2:」 0.3145 -UW2:「 -0.0645 -UW2:年 -0.106 -UW2:ッ 0.0831 -UW2:市 -0.0813 -UW2:議 0.1198 -UW2:小 -0.2009 -UW2:第 0.081 -UW2:初 -0.3025 -UW2:北 -0.3414 -UW2:明 -0.1462 -UW2:県 -0.1165 -UW2:会 0.0978 -TC4:IOO 0.0054 -TC4:HIH 0.0804 -TC4:HII 0.0679 -TC4:IIO 0.0656 -TC4:III 0.1497 -TC4:IIH 0.0321 -TC4:IHO -0.2324 -TC4:MOM 0.0841 -TC4:MHH -0.0405 -TC4:MHI 0.0201 -TC4:HOH 0.0446 -TC4:KAK 0.4845 -TC4:HHO 0.0669 -TC4:MMM 0.0661 -TC4:IHH 0.0695 -TC4:MMH -0.0241 -TC4:KKK 0.3065 -TC4:HHK 0.0365 -TC4:HHI 0.1344 -TC4:HHH -0.0203 -TC4:KKA 0.3386 -TC4:HHN 0.0182 -TC4:HHM -0.0122 -TQ3:BIIH -0.0116 -TQ3:BIII -0.0105 -TQ3:OKHH 0.0587 -TQ3:OIIH 0.1344 -TQ3:BHII -0.0504 -TQ3:BHIH 0.0222 -TQ3:OOHH 0.011 -TQ3:OKAK 0.2792 -TQ3:BHHH 0.0478 -TQ3:BOMH 0.062 -TQ3:BHHM -0.1073 -TQ3:OIHH 0.0623 -TQ3:BMHM -0.0464 -TQ3:OOII -0.0685 -TQ3:OKKA 0.0679 -TQ3:BMHI -0.0863 -TQ3:OHHI 0.1729 -TQ3:OHHH 0.0346 -TQ3:OHMH 0.0481 -TQ3:OHII 0.0997 -TC2:OII -0.2649 -TC2:HMM -0.1154 -TC2:IHI -0.1965 -TC2:KKH 0.0703 -TC2:HII -0.1023 -TC2:HHO 0.2088 -TC3:KOK -0.1009 -TC3:AAA -0.0294 -TC3:NNO 0.0662 -TC3:OHO -0.3393 -TC3:NNH -0.1689 -TC3:KHH -0.1216 -TC3:IOI -0.0542 -TC3:IIM -0.1035 -TC3:HII -0.1088 -TC3:HIK 0.0731 -TC3:IIH -0.0825 -TC3:IHO -0.1935 -TC3:MHO 0.0123 -TC3:MHM -0.0457 -TC3:MHH -0.2694 -TC3:HOH -0.1486 -TC3:KKH -0.1217 -TC3:IHH 0.0128 -TC3:IHI -0.3041 -TC3:MMH -0.0471 -TC3:HHI -0.0341 -TC3:HHH 0.0346 
-TC3:KKA 0.0491 -UW5:月 -0.4353 -UW5:ン -0.0343 -UW5:ル 0.0451 -UW5:挙 0.1618 -UW5:語 -0.1073 -UW5:, 0.0465 -UW5:者 -0.2233 -UW5:務 0.3519 -UW5:E2 -3.2768 -UW5:員 0.2104 -UW5:郎 -0.0368 -UW5:京 0.0722 -UW5:相 0.1319 -UW5:統 0.1955 -UW5:い 0.0331 -UW5:べ 0.1001 -UW5:み 0.0502 -UW5:大 -0.1296 -UW5:日 0.0218 -UW5:に -0.1224 -UW5:な -0.0787 -UW5:ど 0.1682 -UW5:と -0.0127 -UW5:は -0.0578 -UW5:の -0.0635 -UW5:間 0.1191 -UW5:っ 0.0052 -UW5:ち 0.1093 -UW5:だ -0.1186 -UW5:で -0.085 -UW5:て -0.0018 -UW5:つ 0.0921 -UW5:す -0.0852 -UW5:党 -0.0654 -UW5:研 -0.0997 -UW5:げ -0.0983 -UW5:し -0.1371 -UW5:空 -0.0813 -UW5:さ -0.1537 -UW5:か 0.0647 -UW5:お 0.0527 -UW5:え 0.1199 -UW5:く 0.0312 -UW5:ぎ 0.1971 -UW5:き 0.1624 -UW5:が -0.0421 -UW5:あ 0.1655 -UW5:う -0.0503 -UW5:E2 -3.2768 -UW5:表 0.0663 -UW5:区 -0.0901 -UW5:「 0.0363 -UW5:館 -0.0689 -UW5:、 0.0465 -UW5:。 -0.0299 -UW5:長 0.0786 -UW5:査 0.0932 -UW5:題 0.2368 -UW5:思 0.0872 -UW5:機 -0.1508 -UW5:定 0.1785 -UW5:. -0.0299 -UW5:格 0.1356 -UW5:氏 -0.1347 -UW5:ル 0.0451 -UW5:ン -0.0343 -UW5:社 -0.0278 -UW5:新 -0.1682 -UW5:学 -0.0548 -UW5:中 -0.0871 -UW5:所 -0.0814 -UW5:ゃ 0.335 -UW5:め 0.0865 -UW5:ょ 0.0854 -UW5:り -0.0208 -UW5:る 0.0429 -UW5:的 -0.3149 -UW5:わ 0.0419 -UW5:れ 0.0504 -UW5:を -0.1264 -UW5:ん 0.0327 -UW5:イ 0.0241 -UW5:イ 0.0241 -UW5:会 -0.1153 -UW5:嵐 -0.1304 -UW5:1 -0.0514 -UW5:「 0.0363 -UW5:年 0.1763 -UW5:1 -0.0514 -UW5:市 -0.2991 -UW5:議 0.1219 -UW5:田 0.024 -UW5:選 -0.1018 -UW5:町 -0.3912 -UW5:] -0.2762 -UW5:席 0.0921 -UW5:告 0.0848 -UW5:県 -0.4003 -UW5:省 -0.1052 -TC1:AAA 0.1093 -TC1:HOM -0.0331 -TC1:HOH -0.039 -TC1:OOI -0.1832 -TC1:IOM 0.0467 -TC1:IHI 0.1169 -TC1:MMH 0.0187 -TC1:IOI -0.1015 -TC1:IOH -0.0142 -TC1:HII 0.0998 -TC1:HHH 0.1029 -TC1:HHM 0.058 -UC4:A -0.2643 -UC4:I -0.1032 -UC4:H 0.1809 -UC4:K -0.345 -UC4:M 0.3565 -UC4:O 0.6646 -UC4:N 0.3876 -UQ2:OK 0.1759 -UQ2:BH 0.0216 -UQ2:BI 0.0113 -UW4:ー -1.187 -UW4:行 -0.0792 -UW4:規 0.0792 -UW4:・ -0.4371 -UW4:園 -0.12 -UW4:ン -0.3637 -UW4:ラ -0.0881 -UW4:ル -0.0856 -UW4:リ -0.0541 -UW4:メ -0.1635 -UW4:ぎ -0.3821 -UW4:地 0.0866 -UW4:ト -0.0403 -UW4:庁 -0.4556 
-UW4:ッ -0.0724 -UW4:率 0.0672 -UW4:予 0.0782 -UW4:事 -0.019 -UW4:井 -0.1768 -UW4:員 -0.091 -UW4:郎 -0.4866 -UW4:塁 -0.2094 -UW4:署 0.0749 -UW4:来 -0.0442 -UW4:力 -0.0302 -UW4:い -0.3435 -UW4:賞 0.073 -UW4:ほ 0.1464 -UW4:べ -0.0744 -UW4:へ 0.6665 -UW4:み -0.2082 -UW4:ま 0.1051 -UW4:び -0.4134 -UW4:ひ 0.4249 -UW4:ば 0.194 -UW4:共 -0.1212 -UW4:ふ 0.1345 -UW4:に 0.6499 -UW4:な 0.5433 -UW4:中 0.221 -UW4:と 0.4547 -UW4:は 0.8578 -UW4:の 0.7396 -UW4:ね 0.1413 -UW4:ぬ 0.1853 -UW4:っ -0.5882 -UW4:ち -0.3654 -UW4:だ 0.5408 -UW4:で 0.741 -UW4:て 0.3994 -UW4:つ -0.1659 -UW4:せ 0.0181 -UW4:ず 0.1251 -UW4:す -0.0731 -UW4:じ -0.2506 -UW4:た 0.5034 -UW4:そ 0.4091 -UW4:党 -0.2006 -UW4:こ 0.2255 -UW4:げ -0.4734 -UW4:け -0.4376 -UW4:し -0.0843 -UW4:さ 0.2864 -UW4:ご 0.1979 -UW4:か 0.053 -UW4:お 0.2405 -UW4:え -0.2514 -UW4:く -0.3788 -UW4:先 0.0601 -UW4:き -0.4482 -UW4:が 0.6006 -UW4:あ 0.4752 -UW4:う -0.064 -UW4:一 -0.2069 -UW4:島 -0.2056 -UW4:改 0.0787 -UW4:士 -0.1413 -UW4:政 0.2182 -UW4:区 0.4517 -UW4:野 -0.11 -UW4:支 0.0856 -UW4:系 0.0786 -UW4:館 -0.1984 -UW4:化 0.0776 -UW4:参 0.1555 -UW4:込 -0.337 -UW4:. 
0.3508 -UW4:よ 0.3351 -UW4:子 -0.4802 -UW4:学 -0.1397 -UW4:感 0.0916 -UW4:校 -0.036 -UW4:般 -0.0852 -UW4:内 0.0584 -UW4:円 0.0788 -UW4:題 -0.0792 -UW4:高 0.212 -UW4:約 0.2171 -UW4:的 0.2586 -UW4:銀 -0.2213 -UW4:屋 -0.1328 -UW4:済 -0.0543 -UW4:ー -1.187 -UW4:輪 -0.1433 -UW4:山 -0.15 -UW4:コ 0.1789 -UW4:セ 0.1287 -UW4:」 0.3798 -UW4:「 0.1895 -UW4:際 -0.2604 -UW4:・ -0.4371 -UW4:ッ -0.0724 -UW4:産 -0.1101 -UW4:市 0.2771 -UW4:能 -0.073 -UW4:田 -0.29 -UW4:選 0.2596 -UW4:町 0.1826 -UW4:間 -0.2344 -UW4:カ 0.2145 -UW4:体 -0.1286 -UW4:初 0.1347 -UW4:作 0.053 -UW4:カ 0.2145 -UW4:寺 -0.0809 -UW4:側 0.4292 -UW4:道 -0.1291 -UW4:生 -0.1286 -UW4:月 -0.9066 -UW4:都 0.1192 -UW4:最 0.0845 -UW4:立 -0.2112 -UW4:電 -0.0878 -UW4:沢 -0.0939 -UW4:業 -0.1043 -UW4:, 0.393 -UW4:者 0.2145 -UW4:教 0.0704 -UW4:務 -0.2715 -UW4:動 -0.074 -UW4:車 -0.1481 -UW4:回 0.15 -UW4:軍 0.1158 -UW4:経 0.1146 -UW4:国 -0.0619 -UW4:目 0.0922 -UW4:統 -0.1169 -UW4:大 0.0571 -UW4:日 0.1798 -UW4:谷 -0.1 -UW4:空 -0.0867 -UW4:協 0.1013 -UW4:多 0.1067 -UW4:領 -0.1659 -UW4:物 -0.0735 -UW4:人 0.1036 -UW4:〓 -0.5156 -UW4:球 -0.1267 -UW4:「 0.1895 -UW4:」 0.3798 -UW4:、 0.393 -UW4:。 0.3508 -UW4:長 0.0357 -UW4:〇 0.4999 -UW4:川 -0.2667 -UW4:定 -0.1057 -UW4:性 0.0553 -UW4:合 -0.1834 -UW4:後 0.0456 -UW4:時 0.1829 -UW4:首 0.1749 -UW4:ル -0.0856 -UW4:近 0.0929 -UW4:メ -0.1635 -UW4:ラ -0.0881 -UW4:方 -0.0856 -UW4:― -0.4841 -UW4:ト -0.0403 -UW4:文 0.0522 -UW4:所 -0.1566 -UW4:米 0.2937 -UW4:も 0.4169 -UW4:ゃ -0.2666 -UW4:む -0.0882 -UW4:め -0.5046 -UW4:ょ -0.1544 -UW4:や 0.2795 -UW4:院 -0.2297 -UW4:り -0.9726 -UW4:る -1.4896 -UW4:氏 0.5388 -UW4:ら -0.2922 -UW4:わ -0.1783 -UW4:れ -0.2613 -UW4:ろ -0.457 -UW4:を 1.315 -UW4:ん -0.2352 -UW4:気 -0.091 -UW4:民 -0.2716 -UW4:場 -0.141 -UW4:リ -0.0541 -UW4:副 0.3879 -UW4:以 0.0544 -UW4:会 0.095 -UW4:ン -0.3637 -UW4:コ 0.1789 -UW4:年 0.0374 -UW4:和 -0.0681 -UW4:セ 0.1287 -UW4:前 0.1623 -UW4:器 -0.0851 -UW4:総 0.094 -UW4:議 -0.0244 -UW4:小 0.191 -UW4:警 -0.1184 -UW4:線 -0.0994 -UW4:第 0.0788 -UW4:県 0.2997 -UW4:木 -0.0485 -UW4:省 -0.3485 -UQ3:ON -0.3212 -UQ3:BA -0.0479 -UQ3:OI -0.0827 -UQ3:BM 0.316 -UQ3:BN 0.6427 
-UQ3:BO 1.4761 -UQ3:BH 0.0042 -UQ3:BI 0.1913 -UQ3:BK -0.7198 -TQ1:OIHI 0.02 -TQ1:OIIH -0.0068 -TQ1:BIII 0.1595 -TQ1:OAKK 0.0482 -TQ1:BIHH 0.006 -TQ1:BHIH -0.0132 -TQ1:BHHH -0.0227 -TQ1:BHHI 0.0316 -TQ1:BOHH 0.0225 -TQ1:BOOO -0.0908 -TQ1:OHHH 0.0281 -TQ1:BNHH -0.0744 -TQ1:OHIH 0.0249 -UC5:I -0.1238 -UC5:H 0.0313 -UC5:K -0.0799 -UC5:M 0.0539 -UC5:O -0.0831 -TQ4:BIIH -0.0607 -TQ4:BIII -0.2181 -TQ4:OAKK 0.018 -TQ4:OIIH 0.0626 -TQ4:BHII -0.0966 -TQ4:OIHI -0.0493 -TQ4:BHHH -0.0721 -TQ4:OIII -0.4007 -TQ4:BHHM -0.3604 -TQ4:OIHH 0.1935 -TQ4:OHIH -0.1573 -TQ4:OKAK -0.8156 -TQ4:OHHI 0.2446 -TQ4:OHHH -0.0294 -TQ4:OAAA -0.2763 -TQ4:OHHO 0.048 -TW2:その後 -0.443 -TW2:社会党 -0.3216 -TW2:もので 0.1882 -TW2:ていた 0.1833 -TW2:大きな -0.1255 -TW2:ころが -0.2434 -TW2:同時に -0.8097 -TW2:一気に -0.0792 -TW2:ともに -0.4517 -TW2:だって -0.1049 -TW2:対して -0.2721 -TW2:として -0.4657 -TW2:いった -0.1256 -TW2:ある程 -0.2049 -TW2:初めて -0.1512 -TW2:しょう 0.3873 -TW1:東京都 0.2026 -TW1:につい -0.4681 -UW1:も -0.0466 -UW1:主 -0.0402 -UW1:大 0.0561 -UW1:や -0.047 -UW1:・ -0.0135 -UW1:り 0.0208 -UW1:日 -0.0141 -UW1:よ 0.0182 -UW1:ら -0.0292 -UW1:区 -0.0912 -UW1:れ 0.0169 -UW1:京 -0.0268 -UW1:に -0.0789 -UW1:ん -0.0137 -UW1:ど -0.0123 -UW1:と -0.0547 -UW1:は -0.0847 -UW1:の -0.0185 -UW1:都 -0.0718 -UW1:あ -0.0941 -UW1:市 -0.0411 -UW1:委 0.0729 -UW1:で -0.0201 -UW1:県 -0.0386 -UW1:を -0.0446 -UW1:国 -0.046 -UW1:・ -0.0135 -UW1:こ 0.0505 -UW1:理 0.0361 -UW1:午 0.0871 -UW1:, 0.0156 -UW1:「 -0.0463 -UW1:「 -0.0463 -UW1:き 0.0121 -UW1:が -0.0553 -UW1:、 0.0156 -UW1:う -0.0127 -UW1:生 -0.0408 -UP3:B 0.0189 -BP1:OO -0.0125 -BP1:OB 0.0304 -BP1:BB 0.0295 -BP1:UB 0.0352 -TW3:いただ -0.1734 -TW3:してい 0.1314 -TW3:十二月 -0.2287 -TW3:れから -0.3752 -TW3:のもの -0.06 -TW3:にとっ -0.5989 -TW3:に当た -0.6247 -TW3:ので、 -0.0727 -TW3:ので, -0.0727 -TW3:につい -0.5483 -TW3:として -0.4314 -BQ4:BMI -0.3385 -BQ4:OAH 0.0926 -BQ4:BOO -1.2396 -BQ4:OHH 0.0266 -BQ4:BHH -0.3895 -BQ4:ONN -0.0973 -BQ4:BIK 0.1348 -BQ4:BIH 0.3761 -BQ4:BII -0.4654 -BQ4:OHK -0.2036 -BQ4:BKK -0.1806 -BP2:OO -0.1762 -BP2:BO 0.006 -BQ2:BHI -0.1159 -BQ2:BHH 
0.0118 -BQ2:UHI -0.1146 -BQ2:BHM 0.0466 -BQ2:BIH -0.0919 -BQ2:OHM -0.0181 -BQ2:OHH -0.1139 -BQ2:BKO 0.0864 -BQ2:OIH 0.0153 -BQ2:BKK -0.172 -BQ3:BHI 0.2664 -BQ3:BHH -0.0792 -BQ3:OHM 0.0439 -BQ3:OHH 0.2174 -BQ3:OII 0.028 -BQ3:BII -0.0299 -BQ3:BMH 0.0937 -BQ3:OMH -0.2402 -BQ3:BKI 0.0419 -BQ3:BMM 0.8335 -BQ3:BOH 0.0775 -BQ3:BNN 0.0998 -BQ3:OKI -0.0793 -BQ3:OKH 0.1798 -BQ3:OOO 1.1699 -BQ3:OKO -0.2242 -TQ2:BIII -0.1033 -TQ2:BIHH -0.1401 -TQ2:BKAK -0.0543 -TQ2:BOOO -0.5591 -BQ1:BOH -0.0091 -BQ1:BNH 0.0449 -BQ1:BOO -0.2597 -BQ1:BHH 0.115 -BQ1:BIM 0.0886 -BQ1:BHM 0.1521 -BQ1:OHI 0.0451 -BQ1:BII -0.1158 -BQ1:BMH 0.1208 -BQ1:OIH -0.0296 -BQ1:OKA 0.1851 -BQ1:OKH -0.102 -BQ1:OKK 0.0904 -BQ1:OOO 0.2965 -UQ1:OO -0.2422 -UQ1:OK 0.041 -UQ1:OI 0.0477 -UQ1:OH -0.0095 -UQ1:BN 0.0142 -UQ1:BO -0.0056 -UQ1:BH 0.0021 -UQ1:BI -0.0012 -UQ1:BK -0.0099 -UC2:A 0.0819 -UC2:I 0.0409 -UC2:H 0.1059 -UC2:M 0.3987 -UC2:O 0.0646 -UC2:N 0.5775 -UP1:O -0.0214 -UP2:B 0.0069 -UP2:O 0.0935 -BC1:II 0.2461 -BC1:HH 0.0006 -BC1:KH 0.0406 -BC1:OH -0.1378 -BC2:AA -0.3267 -BC2:OO -0.292 -BC2:AI 0.2744 -BC2:KI 0.3831 -BC2:IK 0.1721 -BC2:MK 0.3334 -BC2:AN -0.0878 -BC2:II -0.1332 -BC2:IH -0.1184 -BC2:HH -0.407 -BC2:MH -0.3132 -BC2:HN 0.4012 -BC2:HO 0.3761 -BC2:IO 0.5492 -BC2:HM -0.1711 -BC2:IA 0.1327 -BC2:KK -0.8741 -BC3:HK -0.0721 -BC3:HH 0.0996 -BC3:HI 0.0626 -BC3:HN -0.1307 -BC3:HO -0.0836 -BC3:IH -0.0301 -BC3:KK 0.2762 -BC3:OH 0.0266 -BC3:OA -0.1652 -BC3:MM 0.4034 -BC3:MK 0.1079 diff --git a/src/trainer.rs b/src/trainer.rs index 1bce18d..4750dea 100644 --- a/src/trainer.rs +++ b/src/trainer.rs @@ -83,3 +83,79 @@ impl Trainer { Ok(self.learner.get_metrics()) } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::adaboost::Metrics; + use std::io::Write; + use std::sync::atomic::AtomicBool; + use std::sync::Arc; + use tempfile::NamedTempFile; + + // Helper: create a dummy features file. + // This file should contain at least one line for initialize_features and initialize_instances. 
+ fn create_dummy_features_file() -> NamedTempFile { + let mut file = NamedTempFile::new().expect("Failed to create temp file for features"); + + // For example, it could contain "1 feature1" to represent one feature. + writeln!(file, "1 feature1").expect("Failed to write to features file"); + file + } + + // Helper: create a dummy model file. + // This file should contain the model weights and bias. + fn create_dummy_model_file() -> NamedTempFile { + let mut file = NamedTempFile::new().expect("Failed to create temp file for model"); + + // For example, it could contain a single feature weight and a bias term. + // The feature line is "BW1:こん -0.1262" and the last line is the bias term "100.0". + writeln!(file, "BW1:こん\t-0.1262").expect("Failed to write feature"); + writeln!(file, "100.0").expect("Failed to write bias"); + file + } + + #[test] + fn test_load_model() -> Result<(), Box<dyn std::error::Error>> { + // Prepare a dummy features file + let features_file = create_dummy_features_file(); + + // Create a Trainer instance + let mut trainer = Trainer::new(0.01, 10, 1, features_file.path()); + + // Prepare a dummy model file + let model_file = create_dummy_model_file(); + + // Load the model file into the Trainer + // This should not return an error if the model file is correctly formatted. + // If the model file is not correctly formatted, it will return an error. + trainer.load_model(model_file.path())?; + + Ok(()) + } + + #[test] + fn test_train() -> Result<(), Box<dyn std::error::Error>> { + // Prepare a dummy features file + let features_file = create_dummy_features_file(); + + // Create a Trainer instance with the dummy features file + let mut trainer = Trainer::new(0.01, 5, 1, features_file.path()); + + // Prepare a temporary file for the model output + let model_out = NamedTempFile::new()?; + + // Set AtomicBool to false and immediately exit the learning loop + let running = Arc::new(AtomicBool::new(false)); + + // Execute the train method. 
+ let metrics: Metrics = trainer.train(running, model_out.path())?; + + // Check if the metrics are valid. + // Since metrics are dummy data, we will consider anything 0 or above to be OK here. + assert!(metrics.accuracy >= 0.0); + assert!(metrics.precision >= 0.0); + assert!(metrics.recall >= 0.0); + Ok(()) + } +} From cbe9a6d3601c5f3625b787ce9b62772e0d69593a Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Wed, 4 Jun 2025 22:26:14 +0900 Subject: [PATCH 13/15] Add tests --- src/adaboost.rs | 165 +++++++++++++++++++++++++++++++++++++++++++++++ src/segmenter.rs | 4 +- src/trainer.rs | 5 +- 3 files changed, 171 insertions(+), 3 deletions(-) diff --git a/src/adaboost.rs b/src/adaboost.rs index 2272343..7dd9cb9 100644 --- a/src/adaboost.rs +++ b/src/adaboost.rs @@ -407,3 +407,168 @@ impl AdaBoost { } } } + +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::HashSet; + use std::io::Write; + use std::sync::atomic::AtomicBool; + use std::sync::Arc; + + use tempfile::NamedTempFile; + + #[test] + fn test_initialize_features() -> std::io::Result<()> { + // Create a dummy features file + let mut features_file = NamedTempFile::new()?; + writeln!(features_file, "1 feat1 feat2")?; + writeln!(features_file, "0 feat3")?; + features_file.as_file().sync_all()?; + + let mut learner = AdaBoost::new(0.01, 10, 1); + learner.initialize_features(features_file.path())?; + + // Features is an ordered set that should contain ""(empty string), "feat1", "feat2", "feat3" + assert!(learner.features.contains(&"".to_string())); + assert!(learner.features.contains(&"feat1".to_string())); + assert!(learner.features.contains(&"feat2".to_string())); + assert!(learner.features.contains(&"feat3".to_string())); + Ok(()) + } + + #[test] + fn test_initialize_instances() -> std::io::Result<()> { + // First, initialize features in the feature file. 
+ let mut features_file = NamedTempFile::new()?; + writeln!(features_file, "1 feat1 feat2")?; + features_file.as_file().sync_all()?; + + let mut learner = AdaBoost::new(0.01, 10, 1); + learner.initialize_features(features_file.path())?; + + // Create a dummy instance file + let mut instance_file = NamedTempFile::new()?; + // Example: "1 feat1" line. The learner will consider feat1 as a candidate if found by binary_search. + writeln!(instance_file, "1 feat1")?; + instance_file.as_file().sync_all()?; + + learner.initialize_instances(instance_file.path())?; + + // The number of instances should be 1, and the instance_weights, labels, and instances should be updated accordingly. + assert_eq!(learner.num_instances, 1); + assert_eq!(learner.labels.len(), 1); + assert_eq!(learner.instance_weights.len(), 1); + assert_eq!(learner.instances.len(), 1); + + Ok(()) + } + + #[test] + fn test_train() -> std::io::Result<()> { + // Initialize features using a features file. + let mut features_file = NamedTempFile::new()?; + writeln!(features_file, "1 feat1 feat2")?; + features_file.as_file().sync_all()?; + + let mut learner = AdaBoost::new(0.01, 3, 1); + learner.initialize_features(features_file.path())?; + + // Create a dummy instance file with one instance. + let mut instance_file = NamedTempFile::new()?; + writeln!(instance_file, "1 feat1")?; + instance_file.as_file().sync_all()?; + learner.initialize_instances(instance_file.path())?; + + // Set running to false to immediately exit the learning loop. + let running = Arc::new(AtomicBool::new(false)); + learner.train(running.clone()); + + // If normalization of model or instance_weights is performed after learning, it should be OK. + let weight_sum: f64 = learner.instance_weights.iter().sum(); + + // weight_sum should be normalized to 1.0. + assert!((weight_sum - 1.0).abs() < 1e-6); + + Ok(()) + } + + #[test] + fn test_save_and_load_model() -> std::io::Result<()> { + // Prepare a dummy learner. 
+ let mut learner = AdaBoost::new(0.01, 10, 1); + + // Set the features and weights in advance. + learner.features = vec!["feat1".to_string(), "feat2".to_string()]; + learner.model = vec![0.5, -0.3]; + + // Save the model to a temporary file. + let temp_model = NamedTempFile::new()?; + learner.save_model(temp_model.path())?; + + // Load the model with a new learner. + let mut learner2 = AdaBoost::new(0.01, 10, 1); + learner2.load_model(temp_model.path())?; + + // Check that the number of features and models match. + assert_eq!(learner2.features.len(), learner.features.len()); + assert_eq!(learner2.model.len(), learner.model.len()); + + Ok(()) + } + + #[test] + fn test_add_instance_and_predict() { + let mut learner = AdaBoost::new(0.01, 10, 1); + + // Here, features and model are empty in the initial state. They are newly registered by add_instance. + let mut attrs = HashSet::new(); + attrs.insert("A".to_string()); + learner.add_instance(attrs.clone(), 1); + + // When the same attribute is passed to predict, score returns 1 based on the initial model value (0.0) (because score>=0). + let prediction = learner.predict(attrs); + assert_eq!(prediction, 1); + } + + #[test] + fn test_get_bias() { + let mut learner = AdaBoost::new(0.01, 10, 1); + + // Set model weights as an example. 
+ learner.model = vec![0.2, 0.3, -0.1]; + + // bias = -sum(model)/2 = -(0.2+0.3-0.1)/2 = -0.4/2 = -0.2 + assert!((learner.get_bias() + 0.2).abs() < 1e-6); + } + + #[test] + fn test_get_metrics() { + let mut learner = AdaBoost::new(0.01, 10, 1); + + // Set features and model for prediction + learner.features = vec!["A".to_string(), "B".to_string()]; + learner.model = vec![0.5, -1.0]; + + // Instance 1: Attribute “A” → score = 0.25 + 0.5 = 0.75 (positive example) + let mut attrs1 = HashSet::new(); + attrs1.insert("A".to_string()); + learner.add_instance(attrs1, 1); + + // Instance 2: Attribute “B” → score = 0.25 + (-1.0) = -0.75 (negative example) + let mut attrs2 = HashSet::new(); + attrs2.insert("B".to_string()); + learner.add_instance(attrs2, -1); + + let metrics = learner.get_metrics(); + assert_eq!(metrics.true_positives, 1); + assert_eq!(metrics.true_negatives, 1); + assert_eq!(metrics.false_positives, 0); + assert_eq!(metrics.false_negatives, 0); + assert_eq!(metrics.num_instances, 2); + + // Since this is a simple case, the accuracy is 100%. + assert!((metrics.accuracy - 100.0).abs() < 1e-6); + } +} diff --git a/src/segmenter.rs b/src/segmenter.rs index fb5af4a..7b69ecb 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -286,10 +286,10 @@ impl Segmenter { #[cfg(test)] mod tests { - use std::path::PathBuf; - use super::*; + use std::path::PathBuf; + #[test] fn test_add_sentence_with_writer() { let mut segmenter = Segmenter::new(None); diff --git a/src/trainer.rs b/src/trainer.rs index 4750dea..280e38b 100644 --- a/src/trainer.rs +++ b/src/trainer.rs @@ -87,12 +87,15 @@ impl Trainer { #[cfg(test)] mod tests { use super::*; - use crate::adaboost::Metrics; + use std::io::Write; use std::sync::atomic::AtomicBool; use std::sync::Arc; + use tempfile::NamedTempFile; + use crate::adaboost::Metrics; + // Helper: create a dummy features file. // This file should contain at least one line for initialize_features and initialize_instances. 
fn create_dummy_features_file() -> NamedTempFile { From f555d1710843f1f4ef9a35b46c46ae7ea3e7942d Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Wed, 4 Jun 2025 22:28:51 +0900 Subject: [PATCH 14/15] Update README.md --- README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 727f0ed..6d0ad4d 100644 --- a/README.md +++ b/README.md @@ -80,14 +80,17 @@ The output from the `train` command is similar to: ```text finding instances...: 61 instances found - +loading instances...: 61/61 instances loaded Iteration 9999 - margin: 0.16068839956263622 -Result: -Accuracy: 100.00% (61 / 61) -Precision: 100.00% (24 / 24) -Recall: 100.00% (24 / 24) -Confusion Matrix: TP: 24, FP: 0, FN: 0, TN: 37 -Training completed successfully. +Result Metrics: + Accuracy: 100.00% ( 61 / 61 ) + Precision: 100.00% ( 24 / 24 ) + Recall: 100.00% ( 24 / 24 ) + Confusion Matrix: + True Positives: 24 + False Positives: 0 + False Negatives: 0 + True Negatives: 37 ``` ## How to segment sentences into words From f1f1373f3775cb21172d8c978da74ce4cb613903 Mon Sep 17 00:00:00 2001 From: Minoru Osuka Date: Wed, 4 Jun 2025 22:29:37 +0900 Subject: [PATCH 15/15] Bump up version to 0.2.0 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4594d15..16406bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -226,7 +226,7 @@ checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "litsea" -version = "0.1.0" +version = "0.2.0" dependencies = [ "clap", "ctrlc", diff --git a/Cargo.toml b/Cargo.toml index 1083139..f4e8db2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "litsea" -version = "0.1.0" +version = "0.2.0" edition = "2021" description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." documentation = "https://docs.rs/litsea"