Add main files

This commit is contained in:
sosokker 2024-01-04 12:24:04 +07:00
parent c145f2f6fa
commit 0f80667d4b
8 changed files with 970683 additions and 0 deletions

3
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,3 @@
{
"rust-analyzer.linkedProjects": [".\\Cargo.toml"]
}

12
Cargo.toml Normal file
View File

@ -0,0 +1,12 @@
# Manifest for vocabcollector: extracts unique words from an EPUB and
# translates them with a bundled Thai-English XML dictionary.
[package]
name = "vocabcollector"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
# select: HTML parsing/querying used by src/utils/extractor.rs
select = "0.6.0"
# serde + serde-xml-rs: deserialize the th-en.xml dictionary (src/utils/xmlreader.rs)
serde = { version = "1.0", features = ["derive"] }
serde-xml-rs = "0.6.0"
# zip: EPUB files are ZIP containers (src/main.rs read_epub)
zip = "0.6.6"

BIN
data/example/data.epub Normal file

Binary file not shown.

970507
data/words/th-en.xml Normal file

File diff suppressed because it is too large Load Diff

50
src/main.rs Normal file
View File

@ -0,0 +1,50 @@
use std::fs::File;
use std::io::{self, Read};
use zip::read::ZipArchive;
// Helper modules: HTML text extraction, dictionary-based translation,
// and XML dictionary parsing (files under src/utils/).
pub mod utils {
    pub mod extractor;
    pub mod translator;
    pub mod xmlreader;
}
use utils::extractor::extract_text_from_html;
use utils::translator::{self, Translator};
/// Reads every HTML/XHTML document stored inside an EPUB archive.
///
/// An EPUB is a ZIP container; this walks each archive entry in order and
/// returns the full contents of every markup file as a `String`.
///
/// # Errors
/// Returns an `io::Error` if the file cannot be opened, is not a valid ZIP
/// archive, or an entry's contents are not valid UTF-8 text.
fn read_epub(epub_path: &str) -> io::Result<Vec<String>> {
    let file = File::open(epub_path)?;
    let mut archive = ZipArchive::new(file)?;
    let mut all_html_content = Vec::new();
    for i in 0..archive.len() {
        let mut file = archive.by_index(i)?;
        let name = file.name().to_lowercase();
        // EPUB 3 content documents use the ".xhtml" extension and ".htm" is a
        // common variant; matching only ".html" skipped those chapters entirely.
        if name.ends_with(".html") || name.ends_with(".xhtml") || name.ends_with(".htm") {
            let mut content = String::new();
            file.read_to_string(&mut content)?;
            all_html_content.push(content);
        }
    }
    Ok(all_html_content)
}
/// Entry point: reads the sample EPUB, extracts unique words from each
/// document, translates them via the Thai dictionary, and prints each
/// headword/translation pair.
fn main() {
    // Path to the sample EPUB bundled with the repository.
    let epub_path = "./data/example/data.epub";
    let translator = Translator::new(translator::Thai);

    match read_epub(epub_path) {
        Err(e) => eprintln!("Error reading EPUB: {}", e),
        Ok(documents) => {
            for html in documents {
                let unique_words = extract_text_from_html(&html);
                // Only words present in the dictionary appear in the map.
                for (headword, translation) in &translator.translate(&unique_words) {
                    println!("Headword: {}, Translation: {}", headword, translation);
                }
            }
        }
    }
}

18
src/utils/extractor.rs Normal file
View File

@ -0,0 +1,18 @@
use select::document::Document;
use select::predicate::{Name, Text};
use std::collections::HashSet;
pub fn extract_text_from_html(html_content: &str) -> Vec<String> {
let document = Document::from_read(html_content.as_bytes()).unwrap();
let mut unique_words = HashSet::new();
for node in document.find(Name("text")) {
let text = node.text();
let lowercase_text = text.to_lowercase();
unique_words.insert(lowercase_text);
}
let result: Vec<String> = unique_words.into_iter().collect();
result
}

38
src/utils/translator.rs Normal file
View File

@ -0,0 +1,38 @@
use std::collections::HashMap;
use super::xmlreader::read_xml;
/// A source language whose words can be translated via a lookup table.
pub trait Language {
    /// Returns a map from each input word that has a known translation to
    /// that translation; words without a match are omitted from the result.
    fn translate(&self, word: &Vec<String>) -> HashMap<String, String>;
}
/// Thai source language, backed by the bundled th-en XML dictionary.
pub struct Thai;

impl Language for Thai {
    /// Looks each word up in the Thai-English dictionary and returns the
    /// `word -> translation` pairs for the words that were found.
    fn translate(&self, words: &Vec<String>) -> HashMap<String, String> {
        // The dictionary file is ~970K lines; parse it once per process and
        // cache it, instead of re-reading it on every call as before.
        static WORDS_MAP: std::sync::OnceLock<HashMap<String, String>> =
            std::sync::OnceLock::new();
        let words_map = WORDS_MAP.get_or_init(|| read_xml("./data/words/th-en.xml"));

        let mut translation_result: HashMap<String, String> = HashMap::new();
        for word in words {
            if let Some(translation) = words_map.get(word) {
                translation_result.insert(word.clone(), translation.clone());
            }
        }
        translation_result
    }
}
/// Thin facade that performs translations with a chosen [`Language`].
pub struct Translator<T: Language> {
    // The strategy object that does the actual lookups.
    language: T,
}

impl<T: Language> Translator<T> {
    /// Wraps `language` in a new translator.
    pub fn new(language: T) -> Self {
        Translator { language }
    }

    /// Delegates directly to the underlying language's
    /// [`Language::translate`] implementation.
    pub fn translate(&self, words: &Vec<String>) -> HashMap<String, String> {
        self.language.translate(words)
    }
}

55
src/utils/xmlreader.rs Normal file
View File

@ -0,0 +1,55 @@
use serde::Deserialize;
use serde_xml_rs::from_reader;
use std::collections::HashMap;
use std::fs::File;
/// One dictionary entry: a source-language headword paired with its
/// translation element, as deserialized from the th-en XML file.
#[derive(Debug, Deserialize)]
pub struct Entry {
    pub translation: Translation,
    pub headword: String,
}
/// A translation element: its `lang` field plus the element's character
/// data (bound via serde-xml-rs's special `$value` name).
#[derive(Debug, Deserialize)]
pub struct Translation {
    #[serde(rename = "lang")]
    pub lang: String,
    // "$value" captures the XML element's text content.
    #[serde(rename = "$value")]
    pub value: String,
}
// NOTE(review): not referenced by read_xml below — presumably models another
// element of the Lexitron dictionary format; confirm it is needed before
// removing.
#[derive(Debug, Deserialize)]
pub struct Lexitron {
    #[serde(rename = "id")]
    pub id: String,
}
/// Root of the dictionary document: the list of all `entry` elements.
#[derive(Debug, Deserialize)]
pub struct Lexicon {
    pub entry: Vec<Entry>,
}
/// Parses the dictionary XML at `xml_path` into a headword -> translation map.
///
/// Best-effort: if the file cannot be opened or the XML cannot be parsed, a
/// message is written to stderr and an empty map is returned. When the same
/// headword appears more than once, the last occurrence wins.
pub fn read_xml(xml_path: &str) -> HashMap<String, String> {
    let file = match File::open(xml_path) {
        Ok(f) => f,
        Err(_) => {
            eprintln!("Error opening file: {}", xml_path);
            return HashMap::new();
        }
    };
    match from_reader::<_, Lexicon>(file) {
        Ok(lexicon) => lexicon
            .entry
            .into_iter()
            .map(|entry| (entry.headword, entry.translation.value))
            .collect(),
        Err(err) => {
            eprintln!("Error parsing XML: {:?}", err);
            HashMap::new()
        }
    }
}