Mirror of https://github.com/Sosokker/VocabCollector.git (synced 2025-12-18 13:34:09 +01:00)
Add main files

commit 0f80667d4b · parent c145f2f6fa
.vscode/settings.json · vendored · new file (3 lines)

{
    "rust-analyzer.linkedProjects": ["./Cargo.toml"]
}
Cargo.toml · new file (12 lines)

[package]
name = "vocabcollector"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
select = "0.6.0"
serde = { version = "1.0", features = ["derive"] }
serde-xml-rs = "0.6.0"
zip = "0.6.6"
data/example/data.epub · new file (binary)
Binary file not shown.
data/words/th-en.xml · new file (970507 lines)
File diff suppressed because it is too large.
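The dictionary diff is suppressed, but judging from the deserializer in src/utils/xmlreader.rs below, each entry would look roughly like this (element names come from the serde structs; the headword and translation values here are made up for illustration):

<lexicon>
    <entry>
        <headword>แมว</headword>
        <translation lang="en">cat</translation>
    </entry>
</lexicon>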
src/main.rs · new file

use std::fs::File;
use std::io::{self, Read};
use zip::read::ZipArchive;

pub mod utils {
    pub mod extractor;
    pub mod translator;
    pub mod xmlreader;
}

use utils::extractor::extract_text_from_html;
use utils::translator::{self, Translator};

/// Reads an EPUB (which is a ZIP archive) and returns the contents of
/// every HTML file inside it.
fn read_epub(epub_path: &str) -> io::Result<Vec<String>> {
    let file = File::open(epub_path)?;
    let mut archive = ZipArchive::new(file)?;

    let mut all_html_content = Vec::new();

    for i in 0..archive.len() {
        let mut file = archive.by_index(i)?;

        // Many EPUBs ship .xhtml rather than .html, so accept both.
        let name = file.name().to_lowercase();
        if name.ends_with(".html") || name.ends_with(".xhtml") {
            let mut content = String::new();
            file.read_to_string(&mut content)?;
            all_html_content.push(content);
        }
    }

    Ok(all_html_content)
}

fn main() {
    let epub_path = "./data/example/data.epub";
    let translator = Translator::new(translator::Thai);

    match read_epub(epub_path) {
        Ok(all_html_content) => {
            for html_content in all_html_content {
                let unique_words = extract_text_from_html(&html_content);
                let translation_map = translator.translate(&unique_words);
                for (headword, translation) in &translation_map {
                    println!("Headword: {}, Translation: {}", headword, translation);
                }
            }
        }
        Err(e) => eprintln!("Error reading EPUB: {}", e),
    }
}
src/utils/extractor.rs · new file

use select::document::Document;
use select::predicate::Text;
use std::collections::HashSet;

/// Collects the unique lowercase words from an HTML document. The Text
/// predicate matches text nodes; HTML has no <text> element, so matching
/// on Name("text") would find nothing.
pub fn extract_text_from_html(html_content: &str) -> Vec<String> {
    let document = Document::from_read(html_content.as_bytes()).unwrap();

    let mut unique_words = HashSet::new();

    for node in document.find(Text) {
        // Whitespace splitting is a rough heuristic; Thai in particular
        // is written without spaces between words.
        for word in node.text().split_whitespace() {
            unique_words.insert(word.to_lowercase());
        }
    }

    unique_words.into_iter().collect()
}
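A quick way to sanity-check the extractor is a unit test. This sketch is not part of the commit, and the HTML snippet and expected words are made up:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extracts_unique_lowercase_words() {
        let html = "<html><body><p>Cat cat DOG</p></body></html>";
        let mut words = extract_text_from_html(html);
        words.sort();
        assert_eq!(words, vec!["cat".to_string(), "dog".to_string()]);
    }
}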
src/utils/translator.rs · new file

use std::collections::HashMap;

use super::xmlreader::read_xml;

/// A language that can look up translations for a batch of words.
pub trait Language {
    fn translate(&self, words: &[String]) -> HashMap<String, String>;
}

pub struct Thai;

impl Language for Thai {
    /// Looks each word up in the Thai-English dictionary and keeps the
    /// ones that have an entry. Note that the dictionary is re-read on
    /// every call; callers translating many documents may want to cache it.
    fn translate(&self, words: &[String]) -> HashMap<String, String> {
        let words_map = read_xml("./data/words/th-en.xml");
        let mut translation_result: HashMap<String, String> = HashMap::new();

        for word in words {
            if let Some(translation) = words_map.get(word) {
                translation_result.insert(word.clone(), translation.clone());
            }
        }

        translation_result
    }
}

/// Thin wrapper that delegates to whichever Language it is built with.
pub struct Translator<T: Language> {
    language: T,
}

impl<T: Language> Translator<T> {
    pub fn new(language: T) -> Self {
        Self { language }
    }

    pub fn translate(&self, words: &[String]) -> HashMap<String, String> {
        self.language.translate(words)
    }
}
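The Language trait is what makes the dictionary pluggable: a second language only needs its own impl. A minimal sketch, where the French struct and the fr-en.xml path are hypothetical and not part of this commit:

pub struct French;

impl Language for French {
    fn translate(&self, words: &[String]) -> HashMap<String, String> {
        // Hypothetical dictionary file in the same format as th-en.xml.
        let words_map = read_xml("./data/words/fr-en.xml");
        words
            .iter()
            .filter_map(|w| words_map.get(w).map(|t| (w.clone(), t.clone())))
            .collect()
    }
}

Translator::new(French) would then work anywhere Translator::new(translator::Thai) does.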
src/utils/xmlreader.rs · new file

use serde::Deserialize;
use serde_xml_rs::from_reader;
use std::collections::HashMap;
use std::fs::File;

/// One dictionary entry: a headword and its translation.
#[derive(Debug, Deserialize)]
pub struct Entry {
    pub translation: Translation,
    pub headword: String,
}

#[derive(Debug, Deserialize)]
pub struct Translation {
    pub lang: String,
    /// The text content of the <translation> element.
    #[serde(rename = "$value")]
    pub value: String,
}

/// Currently unused by read_xml.
#[derive(Debug, Deserialize)]
pub struct Lexitron {
    pub id: String,
}

/// The document root: a list of <entry> elements.
#[derive(Debug, Deserialize)]
pub struct Lexicon {
    pub entry: Vec<Entry>,
}

/// Parses the dictionary XML into a headword -> translation map.
/// On any I/O or parse error it logs to stderr and returns an empty map.
pub fn read_xml(xml_path: &str) -> HashMap<String, String> {
    let xml_content = match File::open(xml_path) {
        Ok(file) => file,
        Err(_) => {
            eprintln!("Error opening file: {}", xml_path);
            return HashMap::new();
        }
    };

    let lexicon: Lexicon = match from_reader(xml_content) {
        Ok(lex) => lex,
        Err(err) => {
            eprintln!("Error parsing XML: {:?}", err);
            return HashMap::new();
        }
    };

    let mut result_map: HashMap<String, String> = HashMap::new();

    for entry in lexicon.entry {
        result_map.insert(entry.headword, entry.translation.value);
    }

    result_map
}
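Because th-en.xml runs to 970,507 lines, parsing it on every Translator::translate call is the expensive step. A sketch of a cache using std::sync::OnceLock; the cached_dictionary helper is an illustration, not part of the commit:

use std::sync::OnceLock;

static DICTIONARY: OnceLock<HashMap<String, String>> = OnceLock::new();

/// Parses the XML on first use and reuses the map afterwards.
/// (The path is only honored on the first call.)
pub fn cached_dictionary(xml_path: &str) -> &'static HashMap<String, String> {
    DICTIONARY.get_or_init(|| read_xml(xml_path))
}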