Add main files

This commit is contained in:
sosokker 2024-01-04 12:24:04 +07:00
parent c145f2f6fa
commit 0f80667d4b
8 changed files with 970683 additions and 0 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"rust-analyzer.linkedProjects": [".\\Cargo.toml"]
}

12
Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
# Manifest for vocabcollector: extracts unique words from an EPUB and
# translates them with a bundled Thai-English XML dictionary.
[package]
name = "vocabcollector"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# select: HTML parsing/querying used by src/utils/extractor.rs
select = "0.6.0"
# serde + serde-xml-rs: deserialize the th-en.xml dictionary (src/utils/xmlreader.rs)
serde = { version = "1.0", features = ["derive"] }
serde-xml-rs = "0.6.0"
# zip: EPUB files are ZIP containers (src/main.rs read_epub)
zip = "0.6.6"

BIN
data/example/data.epub Normal file

Binary file not shown.

970507
data/words/th-en.xml Normal file

File diff suppressed because it is too large Load Diff

50
src/main.rs Normal file
View File

@ -0,0 +1,50 @@
use std::fs::File;
use std::io::{self, Read};
use zip::read::ZipArchive;
// Helper modules: HTML text extraction, dictionary-based translation,
// and XML dictionary parsing (files under src/utils/).
pub mod utils {
    pub mod extractor;
    pub mod translator;
    pub mod xmlreader;
}
use utils::extractor::extract_text_from_html;
use utils::translator::{self, Translator};
/// Reads every HTML/XHTML document stored inside an EPUB archive.
///
/// An EPUB is a ZIP container; this walks each archive entry in order and
/// returns the full contents of every markup file as a `String`.
///
/// # Errors
/// Returns an `io::Error` if the file cannot be opened, is not a valid ZIP
/// archive, or an entry's contents are not valid UTF-8 text.
fn read_epub(epub_path: &str) -> io::Result<Vec<String>> {
    let file = File::open(epub_path)?;
    let mut archive = ZipArchive::new(file)?;
    let mut all_html_content = Vec::new();
    for i in 0..archive.len() {
        let mut file = archive.by_index(i)?;
        let name = file.name().to_lowercase();
        // EPUB 3 content documents use the ".xhtml" extension and ".htm" is a
        // common variant; matching only ".html" skipped those chapters entirely.
        if name.ends_with(".html") || name.ends_with(".xhtml") || name.ends_with(".htm") {
            let mut content = String::new();
            file.read_to_string(&mut content)?;
            all_html_content.push(content);
        }
    }
    Ok(all_html_content)
}
/// Entry point: reads the sample EPUB, extracts unique words from each
/// document, translates them via the Thai dictionary, and prints each
/// headword/translation pair.
fn main() {
    // Path to the sample EPUB bundled with the repository.
    let epub_path = "./data/example/data.epub";
    let translator = Translator::new(translator::Thai);

    match read_epub(epub_path) {
        Err(e) => eprintln!("Error reading EPUB: {}", e),
        Ok(documents) => {
            for html in documents {
                let unique_words = extract_text_from_html(&html);
                // Only words present in the dictionary appear in the map.
                for (headword, translation) in &translator.translate(&unique_words) {
                    println!("Headword: {}, Translation: {}", headword, translation);
                }
            }
        }
    }
}

18
src/utils/extractor.rs Normal file
View File

@ -0,0 +1,18 @@
use select::document::Document;
use select::predicate::{Name, Text};
use std::collections::HashSet;
pub fn extract_text_from_html(html_content: &str) -> Vec<String> {
let document = Document::from_read(html_content.as_bytes()).unwrap();
let mut unique_words = HashSet::new();
for node in document.find(Name("text")) {
let text = node.text();
let lowercase_text = text.to_lowercase();
unique_words.insert(lowercase_text);
}
let result: Vec<String> = unique_words.into_iter().collect();
result
}

38
src/utils/translator.rs Normal file
View File

@ -0,0 +1,38 @@
use std::collections::HashMap;
use super::xmlreader::read_xml;
/// A source language whose words can be translated via a lookup table.
pub trait Language {
    /// Returns a map from each input word that has a known translation to
    /// that translation; words without a match are omitted from the result.
    fn translate(&self, word: &Vec<String>) -> HashMap<String, String>;
}
/// Thai source language, backed by the bundled th-en XML dictionary.
pub struct Thai;

impl Language for Thai {
    /// Looks each word up in the Thai-English dictionary and returns the
    /// `word -> translation` pairs for the words that were found.
    fn translate(&self, words: &Vec<String>) -> HashMap<String, String> {
        // The dictionary file is ~970K lines; parse it once per process and
        // cache it, instead of re-reading it on every call as before.
        static WORDS_MAP: std::sync::OnceLock<HashMap<String, String>> =
            std::sync::OnceLock::new();
        let words_map = WORDS_MAP.get_or_init(|| read_xml("./data/words/th-en.xml"));

        let mut translation_result: HashMap<String, String> = HashMap::new();
        for word in words {
            if let Some(translation) = words_map.get(word) {
                translation_result.insert(word.clone(), translation.clone());
            }
        }
        translation_result
    }
}
/// Thin facade that performs translations with a chosen [`Language`].
pub struct Translator<T: Language> {
    // The strategy object that does the actual lookups.
    language: T,
}

impl<T: Language> Translator<T> {
    /// Wraps `language` in a new translator.
    pub fn new(language: T) -> Self {
        Translator { language }
    }

    /// Delegates directly to the underlying language's
    /// [`Language::translate`] implementation.
    pub fn translate(&self, words: &Vec<String>) -> HashMap<String, String> {
        self.language.translate(words)
    }
}

55
src/utils/xmlreader.rs Normal file
View File

@ -0,0 +1,55 @@
use serde::Deserialize;
use serde_xml_rs::from_reader;
use std::collections::HashMap;
use std::fs::File;
/// One dictionary entry: a source-language headword paired with its
/// translation element, as deserialized from the th-en XML file.
#[derive(Debug, Deserialize)]
pub struct Entry {
    pub translation: Translation,
    pub headword: String,
}
/// A translation element: its `lang` field plus the element's character
/// data (bound via serde-xml-rs's special `$value` name).
#[derive(Debug, Deserialize)]
pub struct Translation {
    #[serde(rename = "lang")]
    pub lang: String,
    // "$value" captures the XML element's text content.
    #[serde(rename = "$value")]
    pub value: String,
}
// NOTE(review): not referenced by read_xml below — presumably models another
// element of the Lexitron dictionary format; confirm it is needed before
// removing.
#[derive(Debug, Deserialize)]
pub struct Lexitron {
    #[serde(rename = "id")]
    pub id: String,
}
/// Root of the dictionary document: the list of all `entry` elements.
#[derive(Debug, Deserialize)]
pub struct Lexicon {
    pub entry: Vec<Entry>,
}
/// Parses the dictionary XML at `xml_path` into a headword -> translation map.
///
/// Best-effort: if the file cannot be opened or the XML cannot be parsed, a
/// message is written to stderr and an empty map is returned. When the same
/// headword appears more than once, the last occurrence wins.
pub fn read_xml(xml_path: &str) -> HashMap<String, String> {
    let file = match File::open(xml_path) {
        Ok(f) => f,
        Err(_) => {
            eprintln!("Error opening file: {}", xml_path);
            return HashMap::new();
        }
    };
    match from_reader::<_, Lexicon>(file) {
        Ok(lexicon) => lexicon
            .entry
            .into_iter()
            .map(|entry| (entry.headword, entry.translation.value))
            .collect(),
        Err(err) => {
            eprintln!("Error parsing XML: {:?}", err);
            HashMap::new()
        }
    }
}