From a2813a87b73e2d2a54e713c75966d5c0092cce82 Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Wed, 4 Jun 2025 23:59:06 +0900
Subject: [PATCH 01/10] Refactoring

---
 src/adaboost.rs  |  32 +++++++++
 src/extractor.rs |   2 +-
 src/segmenter.rs | 179 ++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 171 insertions(+), 42 deletions(-)

diff --git a/src/adaboost.rs b/src/adaboost.rs
index 7dd9cb9..82bccfc 100644
--- a/src/adaboost.rs
+++ b/src/adaboost.rs
@@ -80,6 +80,15 @@ impl AdaBoost {
     /// # Returns: A result indicating success or failure.
     ///
     /// # Errors: Returns an error if the file cannot be opened or read.
+    ///
+    /// This method reads the file line by line, extracts features,
+    /// and initializes the model with the features and their corresponding weights.
+    /// It also counts the number of instances and reserves space in the vectors for efficient memory usage.
+    ///
+    /// # Note: The features are stored in a `BTreeMap`, which keeps them in a deterministic, sorted order.
+    /// The last feature is an empty string, which is used as a bias term.
+    /// The model is initialized with zeros for each feature.
+    /// The number of instances is counted so that vector capacity can be reserved up front.
    pub fn initialize_features(&mut self, filename: &Path) -> std::io::Result<()> {
        let file = File::open(filename)?;
        let reader = BufReader::new(file);
@@ -128,6 +137,12 @@ impl AdaBoost {
     /// # Returns: A result indicating success or failure.
     ///
     /// # Errors: Returns an error if the file cannot be opened or read.
+    ///
+    /// This method reads the file line by line, extracts the label and features,
+    /// and initializes the instances with their corresponding weights.
+    /// It calculates the score for each instance based on the features and updates the model accordingly.
+    /// The instance weights are initialized based on the label and score.
+    /// It also prints the progress of loading instances to the standard error output.
    pub fn initialize_instances(&mut self, filename: &Path) -> std::io::Result<()> {
        let file = File::open(filename)?;
        let reader = BufReader::new(file);
@@ -175,6 +190,19 @@ impl AdaBoost {
     ///
     /// # Arguments
     /// * `running`: An `Arc<AtomicBool>` to control the running state of the training process.
+    ///
+    /// # Returns: This method does not return a value.
+    ///
+    /// # Errors: This method does not return an error, but it will stop training if `running` is set to `false`.
+    ///
+    /// This method performs the following steps:
+    /// 1. Initializes the error vector and sums of weights.
+    /// 2. Iterates through the training data for a specified number of iterations.
+    /// 3. For each instance, calculates the error based on the current model.
+    /// 4. Finds the best hypothesis based on the error rates.
+    /// 5. Updates the model with the best hypothesis and calculates the alpha value.
+    /// 6. Updates the instance weights based on the predictions.
+    /// 7. Normalizes the instance weights to ensure they sum to 1.
    pub fn train(&mut self, running: Arc<AtomicBool>) {
        let num_features = self.features.len();
@@ -257,6 +285,10 @@ impl AdaBoost {
     /// # Returns: A result indicating success or failure.
     ///
     /// # Errors: Returns an error if the file cannot be created or written to.
+    ///
+    /// This method writes the model to a file in a tab-separated format,
+    /// where each line contains a feature and its corresponding weight.
+    /// The last line contains the bias term, which is calculated as the negative sum of the model weights divided by 2.
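+    ///
+    /// # Example
+    /// A minimal end-to-end training sketch. The file names and hyperparameters below are
+    /// illustrative (cf. `litsea train -t 0.001 -i 10000` in the README), and the feature
+    /// file is assumed to be the output of feature extraction:
+    /// ```no_run
+    /// use std::path::Path;
+    /// use std::sync::Arc;
+    /// use std::sync::atomic::AtomicBool;
+    ///
+    /// use litsea::adaboost::AdaBoost;
+    ///
+    /// let mut learner = AdaBoost::new(0.001, 10000, 1);
+    /// learner.initialize_features(Path::new("features.txt")).unwrap();
+    /// learner.initialize_instances(Path::new("features.txt")).unwrap();
+    /// // The AtomicBool lets a caller (e.g. a Ctrl-C handler) stop training early.
+    /// learner.train(Arc::new(AtomicBool::new(true)));
+    /// learner.save_model(Path::new("model")).unwrap();
+    /// ```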
    pub fn save_model(&self, filename: &Path) -> std::io::Result<()> {
        let mut file = File::create(filename)?;
        let mut bias = -self.model[0];
diff --git a/src/extractor.rs b/src/extractor.rs
index fbb7b92..101fb22 100644
--- a/src/extractor.rs
+++ b/src/extractor.rs
@@ -72,7 +72,7 @@ impl Extractor {
                 Ok(line) => {
                     let line = line.trim();
                     if !line.is_empty() {
-                        self.segmenter.add_sentence_with_writer(line, &mut learner);
+                        self.segmenter.add_corpus_with_writer(line, &mut learner);
                     }
                 }
                 Err(err) => {
diff --git a/src/segmenter.rs b/src/segmenter.rs
index 7b69ecb..1ee5c75 100644
--- a/src/segmenter.rs
+++ b/src/segmenter.rs
@@ -3,7 +3,7 @@ use regex::Regex;
 use std::collections::HashSet;
 
 /// Segmenter struct for text segmentation using AdaBoost
-/// It uses predefined patterns to classify characters and segments sentences into words.
+/// It uses predefined patterns to classify characters and segment sentences into words.
 pub struct Segmenter {
     patterns: Vec<(Regex, &'static str)>,
     pub learner: AdaBoost,
@@ -17,6 +17,21 @@ impl Segmenter {
     ///
     /// # Returns
     /// A new Segmenter instance with the specified or default AdaBoost learner.
+    ///
+    /// # Note
+    /// The patterns are predefined to match various character types, including numbers, Japanese scripts,
+    /// Korean Hangul, Thai script, Kanji, and Latin characters. Each pattern is associated with a label
+    /// that indicates the type of character it matches, such as "N" for numbers, "I" for Hiragana, "K" for Katakana,
+    /// "G" for Hangul, "T" for Thai, "H" for Kanji, "Z" for CJK Unified Ideographs, "E" for Extended Latin,
+    /// and "A" for ASCII/Full-width Latin characters.
+    ///
+    /// # Example
+    /// ```
+    /// use litsea::segmenter::Segmenter;
+    ///
+    /// let segmenter = Segmenter::new(None);
+    /// ```
+    /// This will create a new Segmenter instance with a default AdaBoost learner.
    pub fn new(learner: Option<AdaBoost>) -> Self {
        let patterns = vec![
            // Numbers
@@ -53,9 +68,22 @@
     /// * `ch` - A string slice representing a single character.
     ///
     /// # Returns
-    /// A string slice representing the type of the character, such as "N" for number,
-    /// "I" for Hiragana, "K" for Katakana, etc. If the character does not match any pattern,
-    /// it returns "O" for Other.
+    /// A string slice representing the type of the character, such as "N" for numbers, "I" for Hiragana,
+    /// "K" for Katakana, "G" for Hangul, "T" for Thai, "H" for Kanji, "Z" for CJK Unified Ideographs,
+    /// "E" for Extended Latin, and "A" for ASCII/Full-width Latin characters.
+    ///
+    /// # Note
+    /// If the character does not match any of the predefined patterns, it returns "O" for Other.
+    ///
+    /// # Example
+    /// ```
+    /// use litsea::segmenter::Segmenter;
+    ///
+    /// let segmenter = Segmenter::new(None);
+    /// let char_type = segmenter.get_type("あ");
+    /// assert_eq!(char_type, "I"); // Hiragana
+    /// ```
+    /// This will return "I" for Hiragana characters.
    pub fn get_type(&self, ch: &str) -> &str {
        for (pattern, label) in &self.patterns {
            if pattern.is_match(ch) {
@@ -65,27 +93,43 @@
         "O" // Other
     }
 
-    /// Adds a sentence to the segmenter with a custom writer function.
+    /// Adds a corpus to the segmenter with a custom writer function.
     ///
     /// # Arguments
-    /// * `sentence` - A string slice representing the sentence to be added.
-    /// * `writer` - A closure that takes a `HashSet` of attributes and a label (`i8`) as arguments.
+    /// * `corpus` - A string slice representing the corpus to be added.
+    /// * `writer` - A closure that takes a `HashSet<String>` of attributes and a label (`i8`) and writes them.
+    ///
+    /// # Note
+    /// The writer function is called for each word in the corpus, allowing for custom handling of the attributes and labels.
+    ///
+    /// # Example
+    /// ```
+    /// use litsea::segmenter::Segmenter;
     ///
-    /// This closure is called for each instance created from the sentence.
-    /// This method processes the sentence, extracts features, and calls the writer function for each instance.
-    /// It constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances.
-    pub fn add_sentence_with_writer<F>(&mut self, sentence: &str, mut writer: F)
+    /// let mut segmenter = Segmenter::new(None);
+    /// segmenter.add_corpus_with_writer("テスト です", |attrs, label| {
+    ///     println!("Attributes: {:?}, Label: {}", attrs, label);
+    /// });
+    /// ```
+    ///
+    /// This will process the corpus and call the writer function for each word, passing the attributes and label.
+    ///
+    /// # Returns
+    /// This method does not return a value.
+    ///
+    /// This method is useful for training the segmenter with a corpus of sentences, allowing it to learn how to segment text into words.
+    pub fn add_corpus_with_writer<F>(&mut self, corpus: &str, mut writer: F)
     where
         F: FnMut(HashSet<String>, i8),
     {
-        if sentence.is_empty() {
+        if corpus.is_empty() {
             return;
         }
         let mut tags = vec!["U".to_string(); 3];
         let mut chars = vec!["B3".to_string(), "B2".to_string(), "B1".to_string()];
         let mut types = vec!["O".to_string(); 3];
-        for word in sentence.split(' ') {
+        for word in corpus.split(' ') {
             if word.is_empty() {
                 continue;
             }
@@ -114,23 +158,40 @@
         }
     }
 
-    /// Adds a sentence to the segmenter for training.
+    /// Adds a corpus to the segmenter.
     ///
     /// # Arguments
-    /// * `sentence` - A string slice representing the sentence to be added.
+    /// * `corpus` - A string slice representing the corpus to be added.
     ///
-    /// This method processes the sentence, extracts features, and adds them to the AdaBoost learner.
-    /// It constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances.
-    /// If the sentence is empty or too short, it does nothing.
-    pub fn add_sentence(&mut self, sentence: &str) {
-        if sentence.is_empty() {
+    /// This method processes the corpus, extracts features, and adds instances to the AdaBoost learner.
+    /// If the corpus is empty, it does nothing.
+    ///
+    /// # Note
+    /// The method constructs attributes based on the characters and their types, and uses the AdaBoost learner to add instances.
+    /// If the corpus is too short or does not contain enough characters, it will not add any instances.
+    /// The attributes are constructed based on the surrounding characters and their types, allowing for rich feature extraction.
+    ///
+    /// # Example
+    /// ```
+    /// use litsea::segmenter::Segmenter;
+    ///
+    /// let mut segmenter = Segmenter::new(None);
+    /// segmenter.add_corpus("テスト です");
+    /// ```
+    /// This will process the corpus and add instances to the segmenter.
+    ///
+    /// # Returns
+    /// This method does not return a value.
+    ///
+    /// This method is useful for training the segmenter with a corpus of sentences, allowing it to learn how to segment text into words.
+    pub fn add_corpus(&mut self, corpus: &str) {
+        if corpus.is_empty() {
             return;
         }
         let mut tags = vec!["U".to_string(); 3];
         let mut chars = vec!["B3".to_string(), "B2".to_string(), "B1".to_string()];
         let mut types = vec!["O".to_string(); 3];
-        for word in sentence.split(' ') {
+        for word in corpus.split(' ') {
             if word.is_empty() {
                 continue;
             }
@@ -167,6 +228,29 @@
     ///
     /// # Returns
     /// A vector of strings, where each string is a segmented word from the sentence.
+    ///
+    /// # Note
+    /// The method processes the sentence character by character, using the AdaBoost learner to predict whether a character is the beginning of a new word or not.
+    /// It constructs attributes based on the surrounding characters and their types, allowing for accurate segmentation.
+    /// If the sentence is empty, it returns an empty vector.
+    ///
+    /// # Example
+    /// ```
+    /// use std::path::PathBuf;
+    ///
+    /// use litsea::segmenter::Segmenter;
+    /// use litsea::adaboost::AdaBoost;
+    ///
+    /// let model_file =
+    ///     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./resources").join("RWCP.model");
+    /// let mut learner = AdaBoost::new(0.01, 100, 1);
+    /// learner.load_model(model_file.as_path()).unwrap();
+    ///
+    /// let segmenter = Segmenter::new(Some(learner));
+    /// let result = segmenter.segment("これはテストです。");
+    /// assert_eq!(result, vec!["これ", "は", "テスト", "です", "。"]);
+    /// ```
+    /// This will segment the sentence into words and return them as a vector of strings.
    pub fn segment(&self, sentence: &str) -> Vec<String> {
        if sentence.is_empty() {
            return Vec::new();
@@ -211,6 +295,12 @@
     ///
     /// # Returns
     /// A HashSet of strings representing the attributes for the specified index.
+    ///
+    /// # Note
+    /// The attributes are constructed based on the surrounding characters and their types, allowing for rich feature extraction.
+    /// This method is used internally by the segmenter to create features for each character in the sentence.
    fn get_attributes(
        &self,
        i: usize,
@@ -291,12 +381,23 @@ mod tests {
     use std::path::PathBuf;
 
     #[test]
-    fn test_add_sentence_with_writer() {
+    fn test_get_type() {
+        let segmenter = Segmenter::new(None);
+
+        assert_eq!(segmenter.get_type("あ"), "I"); // Hiragana
+        assert_eq!(segmenter.get_type("漢"), "H"); // Kanji
+        assert_eq!(segmenter.get_type("A"), "A"); // Latin
+        assert_eq!(segmenter.get_type("1"), "N"); // Digit
+        assert_eq!(segmenter.get_type("@"), "O"); // Not matching any pattern
+    }
+
+    #[test]
+    fn test_add_corpus_with_writer() {
         let mut segmenter = Segmenter::new(None);
         let sentence = "テスト です";
 
         let mut collected = Vec::new();
-        segmenter.add_sentence_with_writer(sentence, |attrs, label| {
+        segmenter.add_corpus_with_writer(sentence, |attrs, label| {
             collected.push((attrs, label));
         });
 
@@ -315,14 +416,15 @@
     }
 
     #[test]
-    fn test_add_sentence_empty() {
+    fn test_add_corpus() {
         let mut segmenter = Segmenter::new(None);
-        segmenter.add_sentence("");
-        // Should not panic or add anything
+        let sentence = "テスト です";
+        segmenter.add_corpus(sentence);
+        // Should not panic; this is just a smoke test
     }
 
     #[test]
-    fn test_segmenter() {
+    fn test_segment() {
         let sentence = "これはテストです。";
 
         let model_file =
@@ -330,9 +432,8 @@
         let mut learner = AdaBoost::new(0.01, 100, 1);
         learner.load_model(model_file.as_path()).unwrap();
 
-        let mut segmenter = Segmenter::new(Some(learner));
+        let segmenter = Segmenter::new(Some(learner));
 
-        segmenter.add_sentence(sentence);
         let result = segmenter.segment(sentence);
 
         assert!(!result.is_empty());
@@ -345,25 +446,21 @@
     }
 
     #[test]
-    fn test_segment_empty_sentence() {
-        let segmenter = Segmenter::new(None);
-        let result = segmenter.segment("");
-        assert!(result.is_empty());
+    fn test_add_corpus_empty() {
+        let mut segmenter = Segmenter::new(None);
+        segmenter.add_corpus("");
+        // Should not panic or add anything
     }
 
     #[test]
-    fn test_get_type() {
+    fn test_segment_empty_sentence() {
         let segmenter = Segmenter::new(None);
-
-        assert_eq!(segmenter.get_type("あ"), "I"); // Hiragana
-        assert_eq!(segmenter.get_type("漢"), "H"); // Kanji
-        assert_eq!(segmenter.get_type("A"), "A"); // Latin
-        assert_eq!(segmenter.get_type("1"), "N"); // Digit
-        assert_eq!(segmenter.get_type("@"), "O"); // Not matching any pattern
+        let result = segmenter.segment("");
+        assert!(result.is_empty());
     }
 
     #[test]
-    fn test_get_attributes_content() {
+    fn test_get_attributes() {
         let segmenter = Segmenter::new(None);
 
         let tags = vec!["U".to_string(); 7];

From 5d468f63182c3f2775073e02cd98077741d13c05 Mon Sep 17 00:00:00 2001
From: Minoru Osuka
Date: Thu, 5 Jun 2025 13:41:30 +0900
Subject: [PATCH 02/10] Update LICENSE

---
 LICENSE   | 31 +++++++++++++++++++++++++++++++
 README.md |  6 ++++++
 2 files changed, 37 insertions(+)

diff --git a/LICENSE b/LICENSE
index d2614d5..bc67505 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
 Copyright (c) 2025 Minoru OSUKA
+Copyright (c) 2022 ICHINOSE Shogo
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -19,3 +20,33 @@
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+--------------------------------------------------------------------
+
+Copyright (c) 2008, Taku Kudo
+
+All rights reserved.
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + * Neither the name of the nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 6d0ad4d..65a6ff8 100644 --- a/README.md +++ b/README.md @@ -122,3 +122,9 @@ You can further improve performance by resuming training from an existing model ```sh ./target/release/litsea train -t 0.001 -i 10000 -m ./resources/model ./resources/new_features.txt ./resources/new_model ``` + +## License + +This project is distributed under the MIT License. +It also contains code originally developed by Taku Kudo and released under the BSD 3-Clause License. +See the LICENSE file for details. From 189411808aafee1ba9a72bc71fb4293455cdd590 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Mon, 9 Jun 2025 15:19:17 +0900 Subject: [PATCH 03/10] Add text extraction script for Wikipedia (#12) * Add text extraction script for Wikipedia * Fix script * fix script * Add corpus.sh * Fix scripts --- scripts/corpus.sh | 108 ++++++++++++++++++ scripts/wikitexts.sh | 262 +++++++++++++++++++++++++++++++++++++++++++ src/segmenter.rs | 33 +++--- 3 files changed, 383 insertions(+), 20 deletions(-) create mode 100755 scripts/corpus.sh create mode 100755 scripts/wikitexts.sh diff --git a/scripts/corpus.sh b/scripts/corpus.sh new file mode 100755 index 0000000..abb5897 --- /dev/null +++ b/scripts/corpus.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +texts_file="${WIKI_TEXTS_FILE:-texts.txt}" +corpus_file="${WIKI_CORPUS_FILE:-corpus.txt}" + +############################################################################### +# usage function +# Displays the usage information for the script. +# Usage: usage +# This function is called when the script is run with the -h option or when an invalid option is provided. +# It prints the usage information and exits the script with a status code of 1. 
+############################################################################### +usage() { + echo "Usage: $0 [-h] [-t texts_file] [-c corpus_file]" + exit 1 +} + +while getopts "ht:c:" opt; do + case "$opt" in + h) usage ;; + t) texts_file="$OPTARG" ;; + c) corpus_file="$OPTARG" ;; + *) usage ;; + esac +done +shift $((OPTIND - 1)) + +echo "Texts file: ${texts_file}" +echo "Corpus file: ${corpus_file}" + +############################################################################### +# spinner definition +############################################################################### +spinner=( '|' '/' '-' '\' ) +spin_idx=0 + +############################################################################### +# cleanup function +# This function is called when the script exits or receives a signal. +# It kills the spinner process and exits the script. +# It is used to ensure that the spinner stops when the script is interrupted. +# Usage: cleanup +############################################################################### +cleanup() { + if [[ -n "$spinner_pid" ]]; then + kill "$spinner_pid" 2>/dev/null + fi + exit 1 +} + +############################################################################### +# Call cleanup when SIGINT, SIGTERM, or EXIT is received. +############################################################################### +trap cleanup INT TERM EXIT + + +############################################################################### +# spinner_loop function +# This function displays a spinner while a task is running. +# It takes a message as an argument to display. +# Usage: spinner_loop "Your message here" +############################################################################### +spinner_loop() { + local msg="$1" + while true; do + echo -ne "${msg} ... ${spinner[spin_idx]} \r" + spin_idx=$(( (spin_idx + 1) % ${#spinner[@]} )) + sleep 0.1 + done +} + + +############################################################################### +# Create the corpus file +############################################################################### +spinner_loop "Creating ${corpus_file}" & +spinner_pid=$! + +# Read one line at a time +while IFS= read -r sentence; do + ## Replace consecutive spaces with a single space + sentence=$(echo "$sentence" | tr -s ' ') + + # Remove leading and trailing whitespace + sentence=$(echo "${sentence}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + + # Skip empty lines + if [[ -z "$sentence" ]]; then + continue + fi + + # Segment the sentence into words using Lindera + words=$(echo "$sentence" | lindera tokenize -k unidic \ + -o wakati \ + -t 'japanese_compound_word:{"kind":"unidic","tags":["名詞,数詞"],"new_tag":"複合語"}' \ + -t 'japanese_compound_word:{"kind":"unidic","tags":["記号,文字"],"new_tag":"複合語"}') + + ## Replace consecutive spaces with a single space + words=$(echo "$words" | tr -s ' ') + + # Append the segmented words to the corpus file + echo "$words" >> "$corpus_file" +done < "$texts_file" + +# Stop the spinner after the loop is complete. +kill "${spinner_pid}" 2>/dev/null +wait "${spinner_pid}" 2>/dev/null +echo "Creating ${corpus_file} completed." 
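+
+# Example invocation (illustrative; assumes the `lindera` CLI is installed and on PATH):
+#   ./scripts/corpus.sh -t texts.txt -c corpus.txt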
diff --git a/scripts/wikitexts.sh b/scripts/wikitexts.sh
new file mode 100755
index 0000000..43ca393
--- /dev/null
+++ b/scripts/wikitexts.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+
+# Default value (uses the value defined in the environment variables, if defined)
+timestamp="${WIKI_TIMESTAMP:-latest}"
+title_count="${WIKI_TITLE_COUNT:-1000}"
+texts_file="${WIKI_TEXTS_FILE:-wiki_texts.txt}"
+
+###############################################################################
+# usage function
+# Displays the usage information for the script.
+# Usage: usage
+# This function is called when the script is run with the -h option or when an invalid option is provided.
+# It prints the usage information and exits the script with a status code of 1.
+###############################################################################
+usage() {
+    echo "Usage: $0 [-h] [-t timestamp] [-c title_count] [-o texts_file]"
+    exit 1
+}
+
+while getopts "ht:c:o:" opt; do
+    case "$opt" in
+        h) usage ;;
+        t) timestamp="$OPTARG" ;;
+        c) title_count="$OPTARG" ;;
+        o) texts_file="$OPTARG" ;;
+        *) usage ;;
+    esac
+done
+shift $((OPTIND - 1))
+
+echo "Timestamp: ${timestamp}"
+echo "Title count: ${title_count}"
+echo "Texts file: ${texts_file}"
+
+
+file_name="jawiki-${timestamp}-pages-articles-multistream-index.txt"
+download_dir=/tmp
+download_file="${file_name}.bz2"
+download_url="https://dumps.wikimedia.org/jawiki/${timestamp}/${download_file}"
+
+###############################################################################
+# spinner definition
+###############################################################################
+spinner=( '|' '/' '-' '\' )
+spin_idx=0
+
+
+###############################################################################
+# cleanup function
+# This function is called when the script exits or receives a signal.
+# It kills the spinner process and exits the script.
+# It is used to ensure that the spinner stops when the script is interrupted.
+# Usage: cleanup
+###############################################################################
+cleanup() {
+    if [[ -n "$spinner_pid" ]]; then
+        kill "$spinner_pid" 2>/dev/null
+    fi
+    exit 1
+}
+
+
+###############################################################################
+# Call cleanup when SIGINT, SIGTERM, or EXIT is received.
+###############################################################################
+trap cleanup INT TERM EXIT
+
+
+###############################################################################
+# spinner_loop function
+# This function displays a spinner while a task is running.
+# It takes a message as an argument to display.
+# Usage: spinner_loop "Your message here"
+###############################################################################
+spinner_loop() {
+    local msg="$1"
+    while true; do
+        echo -ne "${msg} ... ${spinner[spin_idx]} \r"
+        spin_idx=$(( (spin_idx + 1) % ${#spinner[@]} ))
+        sleep 0.1
+    done
+}
+
+
+###############################################################################
+# Download dump file
+###############################################################################
+spinner_loop "Downloading ${download_url}" &
+spinner_pid=$!
+
+# Download the dump file (curl runs in the foreground; the spinner runs in the background).
+curl -s -o "${download_dir}/${download_file}" "${download_url}"
+
+# Stop the spinner after the download is complete.
+kill "${spinner_pid}" 2>/dev/null
+wait "${spinner_pid}" 2>/dev/null
+echo "Downloading ${download_url} completed."
+
+
+###############################################################################
+# Decompressing dump file
+###############################################################################
+spinner_loop "Decompressing ${download_dir}/${download_file}" &
+spinner_pid=$!
+
+# Decompress the dump file (bunzip2 runs in the foreground; the spinner runs in the background).
+bunzip2 -q "${download_dir}/${download_file}" 2>/dev/null
+
+# Stop the spinner after decompression is complete.
+kill "${spinner_pid}" 2>/dev/null
+wait "${spinner_pid}" 2>/dev/null
+echo "Decompressing ${download_dir}/${download_file} completed."
+
+
+###############################################################################
+# Read the dump file, exclude unnecessary lines, extract titles,
+# and save them to a temporary file.
+###############################################################################
+spinner_loop "Extracting titles from ${download_dir}/${file_name}" &
+spinner_pid=$!
+
+tmpfile=$(mktemp "/tmp/${file_name}.XXXXXX")
+
+# Read one line at a time
+while IFS= read -r line; do
+    # Ignore empty lines
+    if [[ -z "${line}" ]]; then
+        continue
+    fi
+
+    # If the line contains ":Category:", ignore it.
+    if [[ "${line}" == *":Category:"* ]]; then
+        continue
+    fi
+
+    # If the line contains ":Template:", ignore it.
+    if [[ "${line}" == *":Template:"* ]]; then
+        continue
+    fi
+
+    # If the line contains ":Wikipedia:", ignore it.
+    if [[ "${line}" == *":Wikipedia:"* ]]; then
+        continue
+    fi
+
+    # If the line contains ":Portal:", ignore it.
+    if [[ "${line}" == *":Portal:"* ]]; then
+        continue
+    fi
+
+    # Split the lines with ':' and get the rightmost part as the title.
+    title="${line##*:}"
+
+    # Ignore empty titles
+    if [[ -z "${title}" ]]; then
+        continue
+    fi
+
+    # Ignore titles starting with "Help"
+    if [[ "${title}" == Help* ]]; then
+        continue
+    fi
+
+    # Ignore titles containing "一覧"
+    if [[ "${title}" == *"一覧"* ]]; then
+        continue
+    fi
+
+    # Ignore titles containing "曖昧さ回避"
+    if [[ "${title}" == *"曖昧さ回避"* ]]; then
+        continue
+    fi
+
+    # Ignore titles containing "削除依頼"
+    if [[ "${title}" == *"削除依頼"* ]]; then
+        continue
+    fi
+
+    # Ignore titles containing "削除記録"
+    if [[ "${title}" == *"削除記録"* ]]; then
+        continue
+    fi
+
+    # Write title to file one line at a time
+    echo "${title}" >> "${tmpfile}"
+done < <(grep -Ev ':[^:]*[a-zA-Z][^:]*:' "${download_dir}/${file_name}")
+
+# Stop the spinner after the loop is complete.
+kill "${spinner_pid}" 2>/dev/null
+wait "${spinner_pid}" 2>/dev/null
+echo "Extracting titles from ${download_dir}/${file_name} completed."
+
+
+###############################################################################
+# Select N titles at random
+###############################################################################
+spinner_loop "Creating ${texts_file}" &
+spinner_pid=$!
+
+shuf -n "${title_count}" "${tmpfile}" | while read -r title; do
+    # If the title is blank, ignore it.
+    if [[ -z "${title}" ]]; then
+        continue
+    fi
+
+    # URL encode title
+    encoded_title=$(echo -n "${title}" | jq -sRr @uri)
+    # echo "Processing title: ${title} (encoded: ${encoded_title})"
+
+    # Generate Wikipedia API URL
+    url="https://ja.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&explaintext=1&redirects=1&titles=${encoded_title}"
+
+    # Retrieve data from API and extract text
+    text=$(curl -s "${url}" | jq -r '.query.pages[] | .extract')
+    # echo "Extracted text: ${text}"
+
+    # If the text is empty, ignore it.
+ if [[ -z "${text}" ]]; then + continue + fi + + # If the text is “null,” ignore it. + if [[ "${text}" == "null" ]]; then + continue + fi + + # Extract the longest line + longest_line=$(echo "${text}" | awk 'length > max_length { max_length = length; longest = $0 } END { print longest }') + # echo "Longest line: ${longest_line}" + + # Split text into sentences + readarray -t sentences < <(echo "${longest_line}" | sed -E 's/([!?\!?。]+)/\1\n/g') + + for sentence in "${sentences[@]}"; do + ## Replace consecutive spaces with a single space + line=$(echo "$line" | tr -s ' ') + + # Trim sentence + sentence=$(echo "${sentence}" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//') + + # If the sentence is empty, ignore it. + if [[ -z "${sentence}" ]]; then + continue + fi + + # 英数記号のみの行を除外 + if [[ "${sentence}" =~ ^[a-zA-Z0-9[:space:]\p{P}\p{S}]+$ ]]; then + continue + fi + + # Append the sentence to the texts file + echo "${sentence}" >> "${texts_file}" + done +done + +# Stop the spinner after the loop is complete. +kill "${spinner_pid}" 2>/dev/null +wait "${spinner_pid}" 2>/dev/null +echo "Creating ${texts_file} completed." diff --git a/src/segmenter.rs b/src/segmenter.rs index 1ee5c75..9eec379 100644 --- a/src/segmenter.rs +++ b/src/segmenter.rs @@ -19,11 +19,7 @@ impl Segmenter { /// A new Segmenter instance with the specified or default AdaBoost learner. /// /// # Note - /// The patterns are predefined to match various character types, including numbers, Japanese scripts, - /// Korean Hangul, Thai script, Kanji, and Latin characters. Each pattern is associated with a label - /// that indicates the type of character it matches, such as "N" for numbers, "I" for Hiragana, "K" for Katakana, - /// "G" for Hangul, "T" for Thai, "H" for Kanji, "Z" for CJK Unified Ideographs, "E" for Extended Latin, - /// and "A" for ASCII/Full-width Latin characters. + /// This method initializes the segmenter with predefined patterns for character classification. /// /// # Example /// ``` @@ -34,26 +30,18 @@ impl Segmenter { /// This will create a new Segmenter instance with a default AdaBoost learner. pub fn new(learner: Option) -> Self { let patterns = vec![ - // Numbers - (Regex::new(r"[0-90-9]").unwrap(), "N"), // Japanese Kanji numbers (Regex::new(r"[一二三四五六七八九十百千万億兆]").unwrap(), "M"), + // Kanji (Japanese) + (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), // Hiragana (Japanese) (Regex::new(r"[ぁ-ん]").unwrap(), "I"), // Katakana (Japanese) (Regex::new(r"[ァ-ヴーア-ン゙゚]").unwrap(), "K"), - // Hangul (Korean) - (Regex::new(r"[가-힣]").unwrap(), "G"), - // Thai script - (Regex::new(r"[ก-๛]").unwrap(), "T"), - // Kanji (Japanese) - (Regex::new(r"[一-龠々〆ヵヶ]").unwrap(), "H"), - // Kanji (CJK Unified Ideographs) - (Regex::new(r"[㐀-䶵一-鿿]").unwrap(), "Z"), - // Extended Latin (Vietnamese, etc.) - (Regex::new(r"[À-ÿĀ-ſƀ-ƿǍ-ɏ]").unwrap(), "E"), // ASCII + Full-width Latin (Regex::new(r"[a-zA-Za-zA-Z]").unwrap(), "A"), + // Numbers + (Regex::new(r"[0-90-9]").unwrap(), "N"), ]; Segmenter { @@ -68,9 +56,14 @@ impl Segmenter { /// * `ch` - A string slice representing a single character. /// /// # Returns - /// A string slice representing the type of the character, such as "N" for numbers, "I" for Hiragana, - /// "K" for Katakana, "G" for Hangul, "T" for Thai, "H" for Kanji, "Z" for CJK Unified Ideographs, - /// "E" for Extended Latin, and "A" for ASCII/Full-width Latin characters. 
+ /// A string slice representing the type of the character: + /// - "M" for Kanji numbers + /// - "H" for Kanji + /// - "I" for Hiragana + /// - "K" for Katakana + /// - "A" for Latin characters (ASCII and Full-width) + /// - "N" for digits (0-9 and Full-width digits) + /// - "O" for Other characters (not matching any pattern) /// /// # Note /// If the character does not match any of the predefined patterns, it returns "O" for Other. From 96ab11343a2c5e058d8027b7ff2a7ac403baae9c Mon Sep 17 00:00:00 2001 From: SyoBoN Date: Thu, 16 Oct 2025 08:35:52 +0900 Subject: [PATCH 04/10] Separate CLI from the main crate (#15) --- Cargo.lock | 11 +++++++++-- Cargo.toml | 21 +++++---------------- litsea-cli/Cargo.toml | 17 +++++++++++++++++ {src => litsea-cli/src}/main.rs | 0 litsea/Cargo.toml | 21 +++++++++++++++++++++ {src => litsea/src}/adaboost.rs | 0 {src => litsea/src}/extractor.rs | 0 {src => litsea/src}/lib.rs | 0 {src => litsea/src}/segmenter.rs | 7 ++++--- {src => litsea/src}/trainer.rs | 0 10 files changed, 56 insertions(+), 21 deletions(-) create mode 100644 litsea-cli/Cargo.toml rename {src => litsea-cli/src}/main.rs (100%) create mode 100644 litsea/Cargo.toml rename {src => litsea/src}/adaboost.rs (100%) rename {src => litsea/src}/extractor.rs (100%) rename {src => litsea/src}/lib.rs (100%) rename {src => litsea/src}/segmenter.rs (98%) rename {src => litsea/src}/trainer.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 16406bb..26dfcba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -228,8 +228,6 @@ checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" name = "litsea" version = "0.2.0" dependencies = [ - "clap", - "ctrlc", "rayon", "regex", "serde", @@ -237,6 +235,15 @@ dependencies = [ "tempfile", ] +[[package]] +name = "litsea-cli" +version = "0.2.0" +dependencies = [ + "clap", + "ctrlc", + "litsea", +] + [[package]] name = "memchr" version = "2.7.4" diff --git a/Cargo.toml b/Cargo.toml index f4e8db2..4016e6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,8 @@ -[package] -name = "litsea" +[workspace] +resolver = "3" +members = ["litsea", "litsea-cli"] + +[workspace.package] version = "0.2.0" edition = "2021" description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." 
@@ -10,17 +13,3 @@ readme = "README.md" keywords = ["word", "segmentation", "training", "model"] categories = ["text-processing"] license = "MIT" - -[features] -default = [] - -[dependencies] -clap = { version = "4.5.39", features = ["derive"] } -ctrlc = "3.4.7" -rayon = "1.10.0" -regex = "1.10.5" -serde = { version = "1.0.219", features = ["derive"] } -serde_json = "1.0.140" - -[dev-dependencies] -tempfile = "3.20.0" diff --git a/litsea-cli/Cargo.toml b/litsea-cli/Cargo.toml new file mode 100644 index 0000000..b2f03d0 --- /dev/null +++ b/litsea-cli/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "litsea-cli" +version.workspace = true +edition.workspace = true +description.workspace = true +documentation.workspace = true +homepage.workspace = true +repository.workspace = true +readme.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true + +[dependencies] +clap = { version = "4.5.39", features = ["derive"] } +ctrlc = "3.4.7" +litsea = { version = "0.2.0", path = "../litsea" } diff --git a/src/main.rs b/litsea-cli/src/main.rs similarity index 100% rename from src/main.rs rename to litsea-cli/src/main.rs diff --git a/litsea/Cargo.toml b/litsea/Cargo.toml new file mode 100644 index 0000000..9cb76b8 --- /dev/null +++ b/litsea/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "litsea" +version.workspace = true +edition.workspace = true +description.workspace = true +documentation.workspace = true +homepage.workspace = true +repository.workspace = true +readme.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true + +[dependencies] +rayon = "1.10.0" +regex = "1.10.5" +serde = { version = "1.0.219", features = ["derive"] } +serde_json = "1.0.140" + +[dev-dependencies] +tempfile = "3.20.0" diff --git a/src/adaboost.rs b/litsea/src/adaboost.rs similarity index 100% rename from src/adaboost.rs rename to litsea/src/adaboost.rs diff --git a/src/extractor.rs b/litsea/src/extractor.rs similarity index 100% rename from src/extractor.rs rename to litsea/src/extractor.rs diff --git a/src/lib.rs b/litsea/src/lib.rs similarity index 100% rename from src/lib.rs rename to litsea/src/lib.rs diff --git a/src/segmenter.rs b/litsea/src/segmenter.rs similarity index 98% rename from src/segmenter.rs rename to litsea/src/segmenter.rs index 9eec379..48c3e3e 100644 --- a/src/segmenter.rs +++ b/litsea/src/segmenter.rs @@ -235,7 +235,7 @@ impl Segmenter { /// use litsea::adaboost::AdaBoost; /// /// let model_file = - /// PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./resources").join("RWCP.model"); + /// PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../resources").join("RWCP.model"); /// let mut learner = AdaBoost::new(0.01, 100, 1); /// learner.load_model(model_file.as_path()).unwrap(); /// @@ -420,8 +420,9 @@ mod tests { fn test_segment() { let sentence = "これはテストです。"; - let model_file = - PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./resources").join("RWCP.model"); + let model_file = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../resources") + .join("RWCP.model"); let mut learner = AdaBoost::new(0.01, 100, 1); learner.load_model(model_file.as_path()).unwrap(); diff --git a/src/trainer.rs b/litsea/src/trainer.rs similarity index 100% rename from src/trainer.rs rename to litsea/src/trainer.rs From 5db09bcfb1d623d566ed2702e59eebf9e4d053d5 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Thu, 16 Oct 2025 09:25:16 +0900 Subject: [PATCH 05/10] Update dependencies (#37) --- .gitignore | 3 ++ Cargo.lock | 93 
++++++++++++++++++++++++++++-------------- Cargo.toml | 13 +++++- litsea-cli/Cargo.toml | 26 ++++++------ litsea/Cargo.toml | 30 +++++++------- litsea/src/adaboost.rs | 4 +- 6 files changed, 108 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index 423b479..52c2a10 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,6 @@ target/ # Added by cargo /target + +.serena +.mcp.json diff --git a/Cargo.lock b/Cargo.lock index 26dfcba..fc4964d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -58,7 +58,7 @@ checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -81,9 +81,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "clap" -version = "4.5.39" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd60e63e9be68e5fb56422e397cf9baddded06dae1d2e523401542383bc72a9f" +checksum = "f4512b90fa68d3a9932cea5184017c5d200f5921df706d45e853537dea51508f" dependencies = [ "clap_builder", "clap_derive", @@ -91,9 +91,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.39" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89cc6392a1f72bbeb820d71f32108f61fdaf18bc526e1d23954168a67759ef51" +checksum = "0025e98baa12e766c67ba13ff4695a887a1eba19569aad00a472546795bd6730" dependencies = [ "anstream", "anstyle", @@ -103,9 +103,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.32" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck", "proc-macro2", @@ -152,14 +152,21 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "ctrlc" -version = "3.4.7" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f93780a459b7d656ef7f071fe699c4d3d2cb201c4b24d085b6ddc505276e73" +checksum = "881c5d0a13b2f1498e2306e82cbada78390e152d4b1378fb28a84f4dcd0dc4f3" dependencies = [ + "dispatch", "nix", - "windows-sys", + "windows-sys 0.61.2", ] +[[package]] +name = "dispatch" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd0c93bb4b0c6d9b77f4435b0ae98c24d17f1c45b2ff844c6151a07256ca923b" + [[package]] name = "either" version = "1.15.0" @@ -173,7 +180,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -300,9 +307,9 @@ checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rayon" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ "either", "rayon-core", @@ -310,9 +317,9 @@ dependencies = [ 
[[package]] name = "rayon-core" -version = "1.12.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -320,9 +327,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.1" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -332,9 +339,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.9" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -357,7 +364,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -368,18 +375,28 @@ checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -388,14 +405,15 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] @@ -417,15 +435,15 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.20.0" +version = "3.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8a64e3985349f2441a1a9ef0b853f869006c3855f2cda6862a94d26ebb9d6a1" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.61.2", ] [[package]] @@ -449,6 +467,12 @@ dependencies = [ "wit-bindgen-rt", ] +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-sys" version = "0.59.0" @@ -458,6 +482,15 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "windows-sys" +version = "0.61.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-targets" version = "0.52.6" diff --git a/Cargo.toml b/Cargo.toml index 4016e6e..03ba31c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ members = ["litsea", "litsea-cli"] [workspace.package] version = "0.2.0" -edition = "2021" +edition = "2024" description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." documentation = "https://docs.rs/litsea" homepage = "https://github.com/mosuka/litsea" @@ -13,3 +13,14 @@ readme = "README.md" keywords = ["word", "segmentation", "training", "model"] categories = ["text-processing"] license = "MIT" + +[workspace.dependencies] +clap = { version = "4.5.49", features = ["derive"] } +ctrlc = "3.5.0" +rayon = "1.11.0" +regex = "1.12.2" +serde = { version = "1.0.228", features = ["derive"] } +serde_json = "1.0.145" +tempfile = "3.23.0" + +litsea = { version = "0.2.0", path = "./litsea" } diff --git a/litsea-cli/Cargo.toml b/litsea-cli/Cargo.toml index b2f03d0..37e7f8b 100644 --- a/litsea-cli/Cargo.toml +++ b/litsea-cli/Cargo.toml @@ -1,17 +1,17 @@ [package] name = "litsea-cli" -version.workspace = true -edition.workspace = true -description.workspace = true -documentation.workspace = true -homepage.workspace = true -repository.workspace = true -readme.workspace = true -keywords.workspace = true -categories.workspace = true -license.workspace = true +version = "0.2.0" +edition = "2024" +description = "Command line interface for Litsea." +documentation = "https://docs.rs/litsea-cli" +homepage = "https://github.com/mosuka/litsea/litsea-cli" +repository = "https://github.com/mosuka/litsea" +readme = "README.md" +keywords = ["word", "segmentation", "training", "model"] +categories = ["text-processing"] +license = "MIT" [dependencies] -clap = { version = "4.5.39", features = ["derive"] } -ctrlc = "3.4.7" -litsea = { version = "0.2.0", path = "../litsea" } +clap.workspace = true +ctrlc.workspace = true +litsea.workspace = true diff --git a/litsea/Cargo.toml b/litsea/Cargo.toml index 9cb76b8..5507375 100644 --- a/litsea/Cargo.toml +++ b/litsea/Cargo.toml @@ -1,21 +1,21 @@ [package] name = "litsea" -version.workspace = true -edition.workspace = true -description.workspace = true -documentation.workspace = true -homepage.workspace = true -repository.workspace = true -readme.workspace = true -keywords.workspace = true -categories.workspace = true -license.workspace = true +version = "0.2.0" +edition = "2024" +description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." 
+documentation = "https://docs.rs/litsea" +homepage = "https://github.com/mosuka/litsea/litsea" +repository = "https://github.com/mosuka/litsea" +readme = "README.md" +keywords = ["word", "segmentation", "training", "model"] +categories = ["text-processing"] +license = "MIT" [dependencies] -rayon = "1.10.0" -regex = "1.10.5" -serde = { version = "1.0.219", features = ["derive"] } -serde_json = "1.0.140" +rayon.workspace = true +regex.workspace = true +serde.workspace = true +serde_json.workspace = true [dev-dependencies] -tempfile = "3.20.0" +tempfile.workspace = true diff --git a/litsea/src/adaboost.rs b/litsea/src/adaboost.rs index 82bccfc..a310c69 100644 --- a/litsea/src/adaboost.rs +++ b/litsea/src/adaboost.rs @@ -108,7 +108,7 @@ impl AdaBoost { } self.num_instances += 1; - if self.num_instances % 1000 == 0 { + if self.num_instances.is_multiple_of(1000) { eprint!("\rfinding instances...: {} instances found", self.num_instances); } } @@ -167,7 +167,7 @@ impl AdaBoost { let end = self.instances_buf.len(); self.instances.push((start, end)); self.instance_weights.push((-2.0 * label as f64 * score).exp()); - if self.instance_weights.len() % 1000 == 0 { + if self.instance_weights.len().is_multiple_of(1000) { eprint!( "\rloading instances...: {}/{} instances loaded", self.instance_weights.len(), From 494d7048d92bfade315f9b4f5e1ac6c48cdfa5d0 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Thu, 16 Oct 2025 09:40:14 +0900 Subject: [PATCH 06/10] Update Makefile (#38) --- Cargo.lock | 4 ++-- Cargo.toml | 4 ++-- Makefile | 13 ++++++++++++- litsea-cli/Cargo.toml | 16 ++++++++-------- litsea/Cargo.toml | 18 +++++++++--------- 5 files changed, 33 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fc4964d..01b75bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -233,7 +233,7 @@ checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "litsea" -version = "0.2.0" +version = "0.3.0" dependencies = [ "rayon", "regex", @@ -244,7 +244,7 @@ dependencies = [ [[package]] name = "litsea-cli" -version = "0.2.0" +version = "0.3.0" dependencies = [ "clap", "ctrlc", diff --git a/Cargo.toml b/Cargo.toml index 03ba31c..681d14a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "3" members = ["litsea", "litsea-cli"] [workspace.package] -version = "0.2.0" +version = "0.3.0" edition = "2024" description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." 
documentation = "https://docs.rs/litsea" @@ -23,4 +23,4 @@ serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.145" tempfile = "3.23.0" -litsea = { version = "0.2.0", path = "./litsea" } +litsea = { version = "0.3.0", path = "./litsea" } diff --git a/Makefile b/Makefile index 1dc3763..b82e9a2 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,9 @@ LITSEA_VERSION ?= $(shell cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | select(.name=="litsea") | .version') +LITSEA_CLI_VERSION ?= $(shell cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | select(.name=="litsea-cli") | .version') + +USER_AGENT ?= $(shell curl --version | head -n1 | awk '{print $1"/"$2}') +USER ?= $(shell whoami) +HOSTNAME ?= $(shell hostname) .DEFAULT_GOAL := help @@ -26,4 +31,10 @@ tag: ## Make a new tag for the current version git push origin v$(LITSEA_VERSION) publish: ## Publish the crate to crates.io - cargo package && cargo publish +ifeq ($(shell curl -s -XGET -H "User-Agent: $(USER_AGENT) ($(USER)@$(HOSTNAME))" https://crates.io/api/v1/crates/litsea | jq -r '.versions[].num' | grep $(LITSEA_VERSION)),) + (cd litsea && cargo package && cargo publish) + sleep 10 +endif +ifeq ($(shell curl -s -XGET -H "User-Agent: $(USER_AGENT) ($(USER)@$(HOSTNAME))" https://crates.io/api/v1/crates/litsea-cli | jq -r '.versions[].num' | grep $(LITSEA_CLI_VERSION)),) + (cd litsea-cli && cargo package && cargo publish) +endif diff --git a/litsea-cli/Cargo.toml b/litsea-cli/Cargo.toml index 37e7f8b..137b1f9 100644 --- a/litsea-cli/Cargo.toml +++ b/litsea-cli/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "litsea-cli" -version = "0.2.0" -edition = "2024" -description = "Command line interface for Litsea." +version.workspace = true +edition.workspace = true +description.workspace = true documentation = "https://docs.rs/litsea-cli" homepage = "https://github.com/mosuka/litsea/litsea-cli" -repository = "https://github.com/mosuka/litsea" -readme = "README.md" -keywords = ["word", "segmentation", "training", "model"] -categories = ["text-processing"] -license = "MIT" +repository.workspace = true +readme.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true [dependencies] clap.workspace = true diff --git a/litsea/Cargo.toml b/litsea/Cargo.toml index 5507375..4c53bec 100644 --- a/litsea/Cargo.toml +++ b/litsea/Cargo.toml @@ -1,15 +1,15 @@ [package] name = "litsea" -version = "0.2.0" -edition = "2024" -description = "Litsea is an extreamely compact word segmentation and model training tool implemented in Rust." 
-documentation = "https://docs.rs/litsea" +version.workspace = true +edition.workspace = true +description.workspace = true +documentation.workspace = true homepage = "https://github.com/mosuka/litsea/litsea" -repository = "https://github.com/mosuka/litsea" -readme = "README.md" -keywords = ["word", "segmentation", "training", "model"] -categories = ["text-processing"] -license = "MIT" +repository.workspace = true +readme.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true [dependencies] rayon.workspace = true From 098baae8aa83c4da8cff46292d7287e678b20ee3 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Thu, 16 Oct 2025 09:55:42 +0900 Subject: [PATCH 07/10] Update workflows (#39) --- .github/workflows/regression.yml | 4 +-- .github/workflows/release.yml | 46 ++++++++++++++++++++++++++++---- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml index 09b9bcd..2a8b880 100644 --- a/.github/workflows/regression.yml +++ b/.github/workflows/regression.yml @@ -21,7 +21,7 @@ jobs: runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install toolchain uses: dtolnay/rust-toolchain@v1 @@ -48,7 +48,7 @@ jobs: runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install toolchain uses: dtolnay/rust-toolchain@v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 327840d..e2d8183 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,7 +20,7 @@ jobs: runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install toolchain uses: dtolnay/rust-toolchain@v1 @@ -47,7 +47,7 @@ jobs: runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install toolchain uses: dtolnay/rust-toolchain@v1 @@ -59,6 +59,23 @@ jobs: - name: Run test run: cargo test --target "${{ matrix.platform.target }}" --all-features + create-release: + name: Create Release + needs: [test] + runs-on: ubuntu-latest + steps: + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + name: Release ${{ github.ref_name }} + tag_name: ${{ github.ref_name }} + draft: false + prerelease: false + generate_release_notes: true + make_latest: true + release: name: Build needs: [test] @@ -82,7 +99,7 @@ jobs: runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Cache ~/.cargo/registry uses: actions/cache@v4 @@ -164,7 +181,26 @@ jobs: LITSEA_VERSION=$(cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | select(.name=="litsea") | .version') LITSEA_VERSIONS=$(curl -s -XGET https://crates.io/api/v1/crates/litsea | jq -r 'select(.versions != null) | .versions[].num') if echo ${LITSEA_VERSIONS} | grep ${LITSEA_VERSION} >/dev/null; then - echo "litsea ${LITSEA_VERSION} has already been published" + echo "litsea ${LITSEA_VERSION} has already published" + else + pushd litsea + cargo publish + popd + fi + sleep 20 + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CRATES_TOKEN }} + + - name: Publish litsea-cli + run: | + LITSEA_CLI_VERSION=$(cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | 
select(.name=="litsea-cli") | .version') + LITSEA_CLI_VERSIONS=$(curl -s -XGET https://crates.io/api/v1/crates/litsea-cli | jq -r 'select(.versions != null) | .versions[].num') + if echo ${LITSEA_CLI_VERSIONS} | grep ${LITSEA_CLI_VERSION} >/dev/null; then + echo "litsea-cli ${LITSEA_CLI_VERSION} has already published" else - cargo publish --token ${{ secrets.CRATES_TOKEN }} + pushd litsea-cli + cargo publish + popd fi + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CRATES_TOKEN }} From 29296aa570b90c8e04677c40a55cb1715f515cb1 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Thu, 16 Oct 2025 10:26:41 +0900 Subject: [PATCH 08/10] Fix release.yml --- .github/workflows/release.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e2d8183..e664f58 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -95,7 +95,6 @@ jobs: archive: .zip extension: .exe toolchain: [stable] - feature: [default] runs-on: ${{ matrix.platform.runner }} steps: - name: Run checkout @@ -130,26 +129,26 @@ jobs: uses: microsoft/setup-msbuild@v2 - name: Compile - run: cargo build --release --features="${{ matrix.feature }}" --target=${{ matrix.platform.target }} --target-dir=target/${{ matrix.feature }} + run: cargo build --release --target=${{ matrix.platform.target }} - name: Create artifact for Linux if: matrix.platform.runner == 'ubuntu-latest' - run: zip --junk-paths litsea-${{ matrix.feature }}-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} target/${{ matrix.feature }}/${{ matrix.platform.target }}/release/litsea${{ matrix.platform.extension }} + run: zip --junk-paths litsea-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} target/${{ matrix.platform.target }}/release/litsea-cli${{ matrix.platform.extension }} - name: Create artifact for Windows if: matrix.platform.runner == 'windows-latest' - run: powershell Compress-Archive -DestinationPath litsea-${{ matrix.feature }}-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} -Path target/${{ matrix.feature }}/${{ matrix.platform.target }}/release/litsea${{ matrix.platform.extension }} + run: powershell Compress-Archive -DestinationPath litsea-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} -Path target/${{ matrix.platform.target }}/release/litsea-cli${{ matrix.platform.extension }} - name: Create artifact for OSX if: matrix.platform.runner == 'macos-latest' - run: zip --junk-paths litsea-${{ matrix.feature }}-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} target/${{ matrix.feature }}/${{ matrix.platform.target }}/release/litsea${{ matrix.platform.extension }} + run: zip --junk-paths litsea-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} target/${{ matrix.platform.target }}/release/litsea-cli${{ matrix.platform.extension }} - name: Upload artifact uses: softprops/action-gh-release@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: - files: litsea-${{ matrix.feature }}-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} + files: litsea-${{ matrix.platform.target }}-${{ github.ref_name }}${{ matrix.platform.archive }} name: Release ${{ github.ref_name }} tag_name: ${{ github.ref_name }} draft: false From 869545cd1c200dc2610f8810b233971432055108 Mon Sep 17 00:00:00 2001 From: Minoru OSUKA Date: Thu, 16 Oct 2025 10:40:59 
Subject: [PATCH 09/10] Add README.md

---
 litsea-cli/README.md | 130 +++++++++++++++++++++++++++++++++++++++++++
 litsea/README.md     | 130 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 260 insertions(+)
 create mode 100644 litsea-cli/README.md
 create mode 100644 litsea/README.md

diff --git a/litsea-cli/README.md b/litsea-cli/README.md
new file mode 100644
index 0000000..65a6ff8
--- /dev/null
+++ b/litsea-cli/README.md
@@ -0,0 +1,130 @@
+# Litsea
+
+Litsea is an extremely compact word segmentation tool implemented in Rust, inspired by [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/) and [TinySegmenterMaker](https://github.com/shogo82148/TinySegmenterMaker). Unlike traditional morphological analyzers such as [MeCab](https://taku910.github.io/mecab/) and [Lindera](https://github.com/lindera/lindera), Litsea does not rely on large-scale dictionaries but instead performs segmentation using a compact pre-trained model. It features a fast and safe Rust implementation along with a learner designed to be simple and highly extensible.
+
+Litsea cubeba (Aomoji) is a small plant in the laurel family (Lauraceae), the same family as Lindera (Kuromoji). This is the origin of the name Litsea.
+
+## How to build Litsea
+
+Litsea is implemented in Rust. To build it, follow these steps:
+
+### Prerequisites
+
+- Install Rust (stable channel) from [rust-lang.org](https://www.rust-lang.org/).
+- Ensure Cargo (Rust’s package manager) is available.
+
+### Build Instructions
+
+1. **Clone the Repository**
+
+   If you haven't already cloned the repository, run:
+
+   ```sh
+   git clone https://github.com/mosuka/litsea.git
+   cd litsea
+   ```
+
+2. **Obtain Dependencies and Build**
+
+   In the project's root directory, run:
+
+   ```sh
+   cargo build --release
+   ```
+
+   The `--release` flag produces an optimized build.
+
+3. **Verify the Build**
+
+   Once complete, the executable will be in the `target/release` folder. Verify by running:
+
+   ```sh
+   ./target/release/litsea --help
+   ```
+
+### Additional Notes
+
+- Using the latest stable Rust ensures compatibility with dependencies and allows use of modern features.
+- Run `cargo update` to refresh your dependencies if needed.
+
+## How to train models
+
+Prepare a corpus with words separated by spaces in advance.
+
+- corpus.txt
+
+  ```text
+  Litsea は TinySegmenter を 参考 に 開発 さ れ た 、 Rust で 実装 さ れ た 極めて コンパクト な 単語 分割 ソフトウェア です 。
+
+  ```
+
+Extract the information and features from the corpus:
+
+```sh
+./target/release/litsea extract ./resources/corpus.txt ./resources/features.txt
+```
+
+The output from the `extract` command is similar to:
+
+```text
+Feature extraction completed successfully.
+```
+
+Train the features output by the above command using AdaBoost. Training stops when the margin of the best weak classifier falls below 0.001, or after 10,000 iterations.
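+
+Here `-t` supplies the stopping threshold and `-i` the maximum number of iterations; this reading of the two flags is inferred from the values in the command below rather than from documented behaviour.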
+
+```sh
+./target/release/litsea train -t 0.001 -i 10000 ./resources/features.txt ./resources/model
+```
+
+The output from the `train` command is similar to:
+
+```text
+finding instances...: 61 instances found
+loading instances...: 61/61 instances loaded
+Iteration 9999 - margin: 0.16068839956263622
+Result Metrics:
+  Accuracy:  100.00% ( 61 / 61 )
+  Precision: 100.00% ( 24 / 24 )
+  Recall:    100.00% ( 24 / 24 )
+  Confusion Matrix:
+    True Positives:  24
+    False Positives: 0
+    False Negatives: 0
+    True Negatives:  37
+```
+
+## How to segment sentences into words
+
+Use the trained model to segment sentences:
+
+```sh
+echo "LitseaはTinySegmenterを参考に開発された、Rustで実装された極めてコンパクトな単語分割ソフトウェアです。" | ./target/release/litsea segment ./resources/model
+```
+
+The output will look like:
+
+```text
+Litsea は TinySegmenter を 参考 に 開発 さ れ た 、 Rust で 実装 さ れ た 極めて コンパクト な 単語 分割 ソフトウェア です 。
+```
+
+## Pre-trained models
+
+- **JEITA_Genpaku_ChaSen_IPAdic.model**
+  This model is trained on the morphologically analyzed corpus published by the Japan Electronics and Information Technology Industries Association (JEITA). It uses data from [Project Sugita Genpaku] analyzed with ChaSen+IPAdic.
+
+- **RWCP.model**
+  Extracted from the original [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/), this model contains only the segmentation component.
+
+## How to retrain existing models
+
+You can further improve performance by resuming training from an existing model with new corpora:
+
+```sh
+./target/release/litsea train -t 0.001 -i 10000 -m ./resources/model ./resources/new_features.txt ./resources/new_model
+```
+
+## License
+
+This project is distributed under the MIT License.
+It also contains code originally developed by Taku Kudo and released under the BSD 3-Clause License.
+See the LICENSE file for details.

diff --git a/litsea/README.md b/litsea/README.md
new file mode 100644
index 0000000..65a6ff8
--- /dev/null
+++ b/litsea/README.md
@@ -0,0 +1,130 @@
+# Litsea
+
+Litsea is an extremely compact word segmentation tool implemented in Rust, inspired by [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/) and [TinySegmenterMaker](https://github.com/shogo82148/TinySegmenterMaker). Unlike traditional morphological analyzers such as [MeCab](https://taku910.github.io/mecab/) and [Lindera](https://github.com/lindera/lindera), Litsea does not rely on large-scale dictionaries but instead performs segmentation using a compact pre-trained model. It features a fast and safe Rust implementation along with a learner designed to be simple and highly extensible.
+
+Litsea cubeba (Aomoji) is a small plant in the laurel family (Lauraceae), the same family as Lindera (Kuromoji). This is the origin of the name Litsea.
+
+## How to build Litsea
+
+Litsea is implemented in Rust. To build it, follow these steps:
+
+### Prerequisites
+
+- Install Rust (stable channel) from [rust-lang.org](https://www.rust-lang.org/).
+- Ensure Cargo (Rust’s package manager) is available.
+
+### Build Instructions
+
+1. **Clone the Repository**
+
+   If you haven't already cloned the repository, run:
+
+   ```sh
+   git clone https://github.com/mosuka/litsea.git
+   cd litsea
+   ```
+
+2. **Obtain Dependencies and Build**
+
+   In the project's root directory, run:
+
+   ```sh
+   cargo build --release
+   ```
+
+   The `--release` flag produces an optimized build.
+
+3. **Verify the Build**
+
+   Once complete, the executable will be in the `target/release` folder. Verify by running:
+
+   ```sh
+   ./target/release/litsea --help
+   ```
+
+### Additional Notes
+
+- Using the latest stable Rust ensures compatibility with dependencies and allows use of modern features.
+- Run `cargo update` to refresh your dependencies if needed.
+
+## How to train models
+
+Prepare a corpus with words separated by spaces in advance.
+
+- corpus.txt
+
+  ```text
+  Litsea は TinySegmenter を 参考 に 開発 さ れ た 、 Rust で 実装 さ れ た 極めて コンパクト な 単語 分割 ソフトウェア です 。
+
+  ```
+
+Extract the information and features from the corpus:
+
+```sh
+./target/release/litsea extract ./resources/corpus.txt ./resources/features.txt
+```
+
+The output from the `extract` command is similar to:
+
+```text
+Feature extraction completed successfully.
+```
+
+Train the features output by the above command using AdaBoost. Training stops when the margin of the best weak classifier falls below 0.001, or after 10,000 iterations.
+
+```sh
+./target/release/litsea train -t 0.001 -i 10000 ./resources/features.txt ./resources/model
+```
+
+The output from the `train` command is similar to:
+
+```text
+finding instances...: 61 instances found
+loading instances...: 61/61 instances loaded
+Iteration 9999 - margin: 0.16068839956263622
+Result Metrics:
+  Accuracy:  100.00% ( 61 / 61 )
+  Precision: 100.00% ( 24 / 24 )
+  Recall:    100.00% ( 24 / 24 )
+  Confusion Matrix:
+    True Positives:  24
+    False Positives: 0
+    False Negatives: 0
+    True Negatives:  37
+```
+
+## How to segment sentences into words
+
+Use the trained model to segment sentences:
+
+```sh
+echo "LitseaはTinySegmenterを参考に開発された、Rustで実装された極めてコンパクトな単語分割ソフトウェアです。" | ./target/release/litsea segment ./resources/model
+```
+
+The output will look like:
+
+```text
+Litsea は TinySegmenter を 参考 に 開発 さ れ た 、 Rust で 実装 さ れ た 極めて コンパクト な 単語 分割 ソフトウェア です 。
+```
+
+## Pre-trained models
+
+- **JEITA_Genpaku_ChaSen_IPAdic.model**
+  This model is trained on the morphologically analyzed corpus published by the Japan Electronics and Information Technology Industries Association (JEITA). It uses data from [Project Sugita Genpaku] analyzed with ChaSen+IPAdic.
+
+- **RWCP.model**
+  Extracted from the original [TinySegmenter](http://chasen.org/~taku/software/TinySegmenter/), this model contains only the segmentation component.
+
+## How to retrain existing models
+
+You can further improve performance by resuming training from an existing model with new corpora:
+
+```sh
+./target/release/litsea train -t 0.001 -i 10000 -m ./resources/model ./resources/new_features.txt ./resources/new_model
+```
+
+## License
+
+This project is distributed under the MIT License.
+It also contains code originally developed by Taku Kudo and released under the BSD 3-Clause License.
+See the LICENSE file for details.
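+
+## Using Litsea as a library
+
+The `litsea` crate can also be embedded directly in Rust programs. The snippet below is a minimal sketch built around the crate's public `Segmenter` type; treat it as an illustration of the API shape, and consult the generated API documentation for exact signatures:
+
+```rust
+use litsea::segmenter::Segmenter;
+
+fn main() {
+    // Build a segmenter with the default AdaBoost learner.
+    let segmenter = Segmenter::new(None);
+
+    // Classify a single character into its script type; "I" marks
+    // Hiragana, and characters matching no pattern are reported as "O".
+    let char_type = segmenter.get_type("あ");
+    assert_eq!(char_type, "I");
+}
+```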
From 1631e18d0b22d6dd9ffcfbf97833fadcf84466ac Mon Sep 17 00:00:00 2001
From: Minoru OSUKA
Date: Thu, 16 Oct 2025 10:58:36 +0900
Subject: [PATCH 10/10] Fix release.yml

---
 litsea-cli/Cargo.toml | 2 +-
 litsea/Cargo.toml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/litsea-cli/Cargo.toml b/litsea-cli/Cargo.toml
index 137b1f9..22d5e85 100644
--- a/litsea-cli/Cargo.toml
+++ b/litsea-cli/Cargo.toml
@@ -6,7 +6,7 @@ description.workspace = true
 documentation = "https://docs.rs/litsea-cli"
 homepage = "https://github.com/mosuka/litsea/litsea-cli"
 repository.workspace = true
-readme.workspace = true
+readme = "README.md"
 keywords.workspace = true
 categories.workspace = true
 license.workspace = true

diff --git a/litsea/Cargo.toml b/litsea/Cargo.toml
index 4c53bec..c8f4ca6 100644
--- a/litsea/Cargo.toml
+++ b/litsea/Cargo.toml
@@ -6,7 +6,7 @@ description.workspace = true
 documentation.workspace = true
 homepage = "https://github.com/mosuka/litsea/litsea"
 repository.workspace = true
-readme.workspace = true
+readme = "README.md"
 keywords.workspace = true
 categories.workspace = true
 license.workspace = true
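A note on this final change: with `readme.workspace = true`, both crates inherited a single `readme` value from the `[workspace.package]` table in the root Cargo.toml, while an explicit `readme = "README.md"` is resolved relative to each crate's own directory, so each package now ships the README added in the previous patch. The sketch below illustrates the inheritance being replaced; the root manifest shown is an assumption for illustration, not a file from this patch series:

```toml
# Hypothetical root Cargo.toml, shown only to illustrate workspace
# inheritance; the actual file is not part of this patch series.
[workspace]
members = ["litsea", "litsea-cli"]

[workspace.package]
# With `readme.workspace = true`, every member crate pointed at this
# single file at the workspace root. Overriding it per crate with
# `readme = "README.md"` makes the path resolve inside each crate
# directory instead, so crates.io renders each crate's own README.
readme = "README.md"
```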