Mirror of https://github.com/Sosokker/VocabCollector.git (synced 2025-12-18 21:44:09 +01:00)
Add main files
commit 0f80667d4b (parent c145f2f6fa)
3  .vscode/settings.json (vendored, new file)
@@ -0,0 +1,3 @@
{
    "rust-analyzer.linkedProjects": [".\\Cargo.toml"]
}
12  Cargo.toml (new file)
@@ -0,0 +1,12 @@
[package]
name = "vocabcollector"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
select = "0.6.0"
serde = { version = "1.0", features = ["derive"] }
serde-xml-rs = "0.6.0"
zip = "0.6.6"
BIN  data/example/data.epub (new file)
Binary file not shown.
970507  data/words/th-en.xml (new file)
File diff suppressed because it is too large.
55  src/main.rs (new file)
@@ -0,0 +1,55 @@
use std::fs::File;
use std::io::{self, Read};
use zip::read::ZipArchive;

pub mod utils {
    pub mod extractor;
    pub mod translator;
    pub mod xmlreader;
}

use utils::extractor::extract_text_from_html;
use utils::translator::{self, Translator};

/// Opens an EPUB (a zip archive) and returns the contents of every entry
/// whose name ends in ".html". Note that many EPUBs store their pages as
/// ".xhtml", which this check would skip.
fn read_epub(epub_path: &str) -> io::Result<Vec<String>> {
    let file = File::open(epub_path)?;
    let mut archive = ZipArchive::new(file)?;

    let mut all_html_content = Vec::new();

    for i in 0..archive.len() {
        let mut file = archive.by_index(i)?;

        if file.name().to_lowercase().ends_with(".html") {
            let mut content = String::new();
            file.read_to_string(&mut content)?;

            all_html_content.push(content);
        }
    }

    Ok(all_html_content)
}

fn main() {
    let epub_path = "./data/example/data.epub";
    let translator = Translator::new(translator::Thai);

    match read_epub(epub_path) {
        Ok(all_html_content) => {
            // For each page: extract the unique words, look them up in the
            // Thai-English dictionary, and print every word that has an entry.
            for html_content in all_html_content {
                let unique_words = extract_text_from_html(&html_content);
                let translation_map = translator.translate(&unique_words);
                for (headword, translation) in &translation_map {
                    println!("Headword: {}, Translation: {}", headword, translation);
                }
            }
        }
        Err(e) => eprintln!("Error reading EPUB: {}", e),
    }
}
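Not part of the commit: a minimal sketch of how read_epub could be exercised without a real EPUB, by building a tiny zip in memory and writing it to a temp file. The entry names, sample markup, and temp-file name are all invented for illustration.

#[cfg(test)]
mod tests {
    use super::read_epub;
    use std::io::Write;
    use zip::write::{FileOptions, ZipWriter};

    #[test]
    fn read_epub_returns_only_html_entries() {
        // Build a two-entry zip in memory: one .html page, one stylesheet.
        let mut buf = std::io::Cursor::new(Vec::new());
        {
            let mut zip = ZipWriter::new(&mut buf);
            zip.start_file("chapter1.html", FileOptions::default()).unwrap();
            zip.write_all(b"<html><body>hello</body></html>").unwrap();
            zip.start_file("styles.css", FileOptions::default()).unwrap();
            zip.write_all(b"body {}").unwrap();
            zip.finish().unwrap();
        }

        // read_epub takes a path, so persist the archive to a temp file.
        let path = std::env::temp_dir().join("vocabcollector_test.epub");
        std::fs::write(&path, buf.into_inner()).unwrap();

        // Only the .html entry should come back.
        let pages = read_epub(path.to_str().unwrap()).unwrap();
        assert_eq!(pages.len(), 1);
        assert!(pages[0].contains("hello"));
    }
}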
21  src/utils/extractor.rs (new file)
@@ -0,0 +1,21 @@
use select::document::Document;
use select::predicate::Name;
use std::collections::HashSet;

/// Parses an HTML page and returns the unique, lowercased,
/// whitespace-separated words found under its `<body>` element.
/// (Thai script has no spaces between words, so Thai words are only
/// picked up when they appear as separate tokens.)
pub fn extract_text_from_html(html_content: &str) -> Vec<String> {
    let document = Document::from_read(html_content.as_bytes()).unwrap();

    let mut unique_words = HashSet::new();

    for node in document.find(Name("body")) {
        for word in node.text().split_whitespace() {
            unique_words.insert(word.to_lowercase());
        }
    }

    unique_words.into_iter().collect()
}
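A quick sanity check, not in the commit, of what the extractor returns for a small page; the sample markup and test name are invented.

#[cfg(test)]
mod tests {
    use super::extract_text_from_html;

    #[test]
    fn extracts_lowercased_unique_words() {
        let html = "<html><body><p>Hello hello WORLD</p></body></html>";
        let mut words = extract_text_from_html(html);
        words.sort();
        // Duplicates collapse and case is normalized.
        assert_eq!(words, vec!["hello".to_string(), "world".to_string()]);
    }
}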
41  src/utils/translator.rs (new file)
@@ -0,0 +1,41 @@
use std::collections::HashMap;

use super::xmlreader::read_xml;

/// A source language that can translate a batch of words at once.
pub trait Language {
    fn translate(&self, words: &[String]) -> HashMap<String, String>;
}

pub struct Thai;

impl Language for Thai {
    fn translate(&self, words: &[String]) -> HashMap<String, String> {
        // Note: the whole dictionary is re-read on every call; caching the
        // parsed map would avoid the repeated work.
        let words_map = read_xml("./data/words/th-en.xml");
        let mut translation_result: HashMap<String, String> = HashMap::new();

        for word in words {
            if let Some(translation) = words_map.get(word) {
                translation_result.insert(word.clone(), translation.clone());
            }
        }

        translation_result
    }
}

pub struct Translator<T: Language> {
    language: T,
}

impl<T: Language> Translator<T> {
    pub fn new(language: T) -> Self {
        Self { language }
    }

    pub fn translate(&self, words: &[String]) -> HashMap<String, String> {
        self.language.translate(words)
    }
}
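One design note: because Translator is generic over Language, supporting another dictionary only takes a new impl. A hypothetical stub (not in the commit; the Identity name is made up) that could be added to translator.rs and maps every word to itself:

pub struct Identity;

impl Language for Identity {
    fn translate(&self, words: &[String]) -> HashMap<String, String> {
        // Echo each word back unchanged; handy as a placeholder when no
        // dictionary file is available.
        words.iter().map(|w| (w.clone(), w.clone())).collect()
    }
}

// Usage:
// let t = Translator::new(Identity);
// assert_eq!(t.translate(&["hello".into()])["hello"], "hello");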
60  src/utils/xmlreader.rs (new file)
@@ -0,0 +1,60 @@
use serde::Deserialize;
use serde_xml_rs::from_reader;
use std::collections::HashMap;
use std::fs::File;

/// One dictionary entry: a headword plus its translation.
#[derive(Debug, Deserialize)]
pub struct Entry {
    pub translation: Translation,
    pub headword: String,
}

#[derive(Debug, Deserialize)]
pub struct Translation {
    #[serde(rename = "lang")]
    pub lang: String,
    // "$value" captures the text content of the element.
    #[serde(rename = "$value")]
    pub value: String,
}

// Currently unused by read_xml; models the id of a dictionary record.
#[derive(Debug, Deserialize)]
pub struct Lexitron {
    #[serde(rename = "id")]
    pub id: String,
}

#[derive(Debug, Deserialize)]
pub struct Lexicon {
    pub entry: Vec<Entry>,
}

/// Reads the dictionary XML and returns a headword-to-translation map.
/// Errors are reported on stderr and produce an empty map.
pub fn read_xml(xml_path: &str) -> HashMap<String, String> {
    let xml_content = match File::open(xml_path) {
        Ok(file) => file,
        Err(_) => {
            eprintln!("Error opening file: {}", xml_path);
            return HashMap::new();
        }
    };

    let lexicon: Lexicon = match from_reader(xml_content) {
        Ok(lex) => lex,
        Err(err) => {
            eprintln!("Error parsing XML: {:?}", err);
            return HashMap::new();
        }
    };

    let mut result_map: HashMap<String, String> = HashMap::new();

    for entry in lexicon.entry {
        result_map.insert(entry.headword, entry.translation.value);
    }

    result_map
}
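Not part of the commit, and only a guess from the serde annotations: an inline test showing the XML shape these structs would accept. The sample document is invented; the real th-en.xml schema may differ.

#[cfg(test)]
mod tests {
    use super::Lexicon;

    #[test]
    fn parses_a_minimal_lexicon() {
        // Invented sample: <translation> carries a lang attribute plus text
        // content, matching the "$value" rename above.
        let xml = r#"
            <lexicon>
                <entry>
                    <translation lang="en">water</translation>
                    <headword>น้ำ</headword>
                </entry>
            </lexicon>"#;
        let lexicon: Lexicon = serde_xml_rs::from_str(xml).unwrap();
        assert_eq!(lexicon.entry[0].headword, "น้ำ");
        assert_eq!(lexicon.entry[0].translation.value, "water");
    }
}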