diff --git a/src/dic.rs b/src/dic.rs new file mode 100644 index 0000000..f665309 --- /dev/null +++ b/src/dic.rs @@ -0,0 +1,241 @@ +// SPDX-FileCopyrightText: 2024 Joshua Goins +// SPDX-License-Identifier: GPL-3.0-or-later + +use std::collections::HashMap; +use std::io::{Cursor, Seek, SeekFrom}; + +use binrw::{BinRead, BinReaderExt}; +use binrw::binrw; +use crate::ByteSpan; + +// Based off of https://github.com/Lotlab/ffxiv-vulgar-words-reader/ +// Credit goes to Jim Kirisame for documenting this format +// TODO: double check I'm reading everything correctly + +#[binrw] +#[derive(Debug)] +#[brw(little)] +pub struct EntryItem { + flag: u32, + sibling: u32, + child: u32, + offset: u32 +} + +#[binrw] +#[derive(Debug)] +#[brw(little)] +struct DictionaryHeader { + #[br(seek_before = SeekFrom::Start(0x8124))] + #[br(count = 256)] + chara_replace1: Vec, + + #[br(count = 256)] + chara_replace2: Vec, + + #[br(count = 256)] + chara_replace3: Vec, + + #[br(count = 5)] + block_offsets: Vec, + + #[br(count = 5)] + block_lengths: Vec, + + #[br(pad_before = 4)] + #[br(count = 256)] + chara_block: Vec, + + #[br(ignore)] + begin_node: Vec, + + #[br(ignore)] + inner_node: Vec, + + #[br(ignore)] + chara: Vec, + + #[br(ignore)] + word: Vec, + + #[br(ignore)] + entries: Vec, +} + +pub struct Dictionary { + header: DictionaryHeader, + pub words: Vec +} + +impl Dictionary { + /// Parses an existing dictionary file. + pub fn from_existing(buffer: ByteSpan) -> Option { + let mut cursor = Cursor::new(buffer); + let mut dict = DictionaryHeader::read(&mut cursor).unwrap(); + + let map_start = 0x8750u32; + let map_size = 0x200u32; + + // fix up offsets + for offset in &mut dict.block_offsets { + *offset = *offset + map_start + map_size; + } + + for i in 0..dict.block_lengths[0] / 2 { + let offset = dict.block_offsets[0] + i * 2; + cursor.seek(SeekFrom::Start(offset as u64)).ok()?; + dict.begin_node.push(cursor.read_le::().ok()?); + } + + for i in 0..dict.block_lengths[1] / 2 { + let offset = dict.block_offsets[1] + i * 2; + cursor.seek(SeekFrom::Start(offset as u64)).ok()?; + dict.inner_node.push(cursor.read_le::().ok()?); + } + + for i in 0..dict.block_lengths[2] / 2 { + let offset = dict.block_offsets[2] + i * 2; + cursor.seek(SeekFrom::Start(offset as u64)).ok()?; + dict.chara.push(cursor.read_le::().ok()?); + } + + for i in 0..dict.block_lengths[3] / 2 { + let offset = dict.block_offsets[3] + i * 2; + cursor.seek(SeekFrom::Start(offset as u64)).ok()?; + dict.word.push(cursor.read_le::().ok()?); + } + + for i in 0..dict.block_lengths[4] / 16 { + let offset = dict.block_offsets[4] + i * 16; + cursor.seek(SeekFrom::Start(offset as u64)).ok()?; + dict.entries.push(cursor.read_le::().ok()?); + } + + let mut dict = Dictionary { + header: dict, + words: Vec::new() + }; + + // TODO: lol + dict.words = dict.list_words()?; + + Some(dict) + } + + fn list_words(&self) -> Option> { + let mut result = Vec::new(); + let lut = self.generate_index_rune_lookup_table(); + for (id, v) in self.header.begin_node.iter().enumerate() { + if *v == 0 { + continue; + } + + let chara = Dictionary::index_to_rune(&lut, id as u32); + self.dump_dict_node(&mut result, *v as i32, String::from(chara as u8 as char)) + } + + Some(result) + } + + fn generate_index_rune_lookup_table(&self) -> HashMap { + let mut map = HashMap::new(); + for i in 0..self.header.chara_block.len() { + map.insert(self.header.chara_block[i] as u16, i as u16); + } + + map + } + + fn index_to_rune(lookup_table: &HashMap, index: u32) -> i32 { + let higher = index >> 8; + let lower = index & 0xFF; + + if higher == 0 { + return 0; + } + + return if let Some(new_val) = lookup_table.get(&(higher as u16)) { + (((*new_val as u32) << 8) + lower) as i32 + } else { + 0 + } + } + + fn dump_dict_node(&self, vec: &mut Vec, entry_id: i32, prev: String) { + let node = &self.header.entries[entry_id as usize]; + for i in 0..node.sibling { + let Some(current) = self.get_string(entry_id, i as i32) else { + return; + }; + + if node.child == 0 { + vec.push(prev.clone() + ¤t); + continue; + } + + let value = self.header.inner_node[(node.child + i) as usize]; + if value == 0 { + vec.push(prev.clone() + ¤t); + continue; + } + + self.dump_dict_node(vec, value as i32, prev.clone() + ¤t); + } + } + + fn get_string(&self, entry_id: i32, sibling_id: i32) -> Option { + if let Some(characters) = self.get_string_characters(entry_id, sibling_id) { + return String::from_utf16(&characters).ok(); + } + + None + } + + fn get_string_characters(&self, entry_id: i32, sibling_id: i32) -> Option> { + if entry_id as usize >= self.header.entries.len() { + return None; + } + + let entry = self.header.entries.get(entry_id as usize)?; + + if entry.flag == 0 { + let pos = (entry.offset / 2) as i32 + sibling_id; + if pos as usize > self.header.chara.len() { + return None; + } + + if self.header.chara[pos as usize] == 0 { + return None; + } + + return Some(vec![self.header.chara[pos as usize]]); + } + + let begin = entry.offset / 2; + let mut end = begin + 1; + + while (end as usize) < self.header.word.len() && self.header.word[end as usize] != 0 { + end += 1; + } + + Some(self.header.word[begin as usize..end as usize].to_vec()) + } +} + +#[cfg(test)] +mod tests { + use std::fs::read; + use std::path::PathBuf; + + use super::*; + + #[test] + fn test_invalid() { + let mut d = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + d.push("resources/tests"); + d.push("random"); + + // Feeding it invalid data should not panic + Dictionary::from_existing(&read(d).unwrap()); + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index e24335c..b195b19 100755 --- a/src/lib.rs +++ b/src/lib.rs @@ -118,5 +118,8 @@ pub mod tera; mod common_file_operations; +/// Reading word dictionaries, such as the vulgar word list. +pub mod dic; + #[doc(hidden)] pub const PHYSIS_VERSION: &str = env!("CARGO_PKG_VERSION");