Skip to content

Commit

Permalink
feat: reading skipped in unidic lex that matches surface in katakana
Browse files Browse the repository at this point in the history
  • Loading branch information
BlueGreenMagick committed Jun 17, 2024
1 parent 2a49a63 commit c016e8f
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
14 changes: 14 additions & 0 deletions rust/src/japanese.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub trait JapaneseChar {
fn is_kana(&self) -> bool;
fn is_hiragana(&self) -> bool;
fn is_katakana(&self) -> bool;
fn to_katakana(&self) -> char;
}

impl JapaneseChar for char {
Expand All @@ -17,16 +18,29 @@ impl JapaneseChar for char {
/// Returns `true` when the character lies in `U+30A0..=U+30FF`,
/// i.e. the Unicode Katakana block.
fn is_katakana(&self) -> bool {
    ('\u{30a0}'..='\u{30ff}').contains(self)
}

/// Converts a hiragana character to its katakana counterpart.
///
/// Characters in `U+3041..=U+3096` (ぁ–ゖ) are shifted up by `0x60` code
/// points, landing on `U+30A1..=U+30F6` (ァ–ヶ). Every other character is
/// returned unchanged.
fn to_katakana(&self) -> char {
    match *self {
        // The shifted value always stays inside the katakana block, so it is
        // a valid scalar value and the conversion cannot fail.
        '\u{3041}'..='\u{3096}' => char::from_u32(*self as u32 + 0x60).unwrap(),
        other => other,
    }
}
}

/// String-level helpers for working with Japanese text.
pub trait JapaneseString {
/// Returns whether the text consists solely of kana characters.
fn contains_only_kana(&self) -> bool;
/// Returns an owned copy of the text with its characters converted to katakana.
fn to_katakana(&self) -> String;
}

impl JapaneseString for str {
    /// Returns `true` when `is_kana` holds for every character of the string.
    ///
    /// An empty string yields `true`, since there is no character that fails
    /// the check.
    fn contains_only_kana(&self) -> bool {
        self.chars().all(|ch| ch.is_kana())
    }

    /// Builds a new `String` by applying the per-character `to_katakana`
    /// conversion to each character, preserving order.
    fn to_katakana(&self) -> String {
        // Start with room for the input's byte length to limit regrowth
        // while the converted characters are appended.
        let mut converted = String::with_capacity(self.len());
        converted.extend(self.chars().map(|ch| ch.to_katakana()));
        converted
    }
}

/// 五段
Expand Down
2 changes: 1 addition & 1 deletion rust/src/tokenize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ impl TokenDetails {
.next()
.map(|r| {
if r.is_empty() {
surface.into()
surface.to_katakana()
} else {
r.to_string()
}
Expand Down
12 changes: 11 additions & 1 deletion unidic/src/transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,8 @@ fn identical_base_to_empty_string(items: &mut Vec<LexItem>) {
/// if item.surface == item.reading (most kana-only words), set reading to "" to save space
fn identical_reading_to_empty_string(items: &mut Vec<LexItem>) {
for item in items {
if item.surface == item.reading {
let katakana_surface: String = item.surface.chars().map(|c| c.to_katakana()).collect();
if katakana_surface == item.reading {
item.reading = "".into()
}
}
Expand Down Expand Up @@ -492,6 +493,7 @@ pub trait JapaneseChar {
fn is_kana(&self) -> bool;
fn is_hiragana(&self) -> bool;
fn is_katakana(&self) -> bool;
fn to_katakana(&self) -> char;
}

impl JapaneseChar for char {
Expand All @@ -506,4 +508,12 @@ impl JapaneseChar for char {
/// Returns `true` when the character lies in `U+30A0..=U+30FF`,
/// i.e. the Unicode Katakana block.
fn is_katakana(&self) -> bool {
    ('\u{30a0}'..='\u{30ff}').contains(self)
}

/// Converts a hiragana character to its katakana counterpart.
///
/// Characters in `U+3041..=U+3096` (ぁ–ゖ) are shifted up by `0x60` code
/// points, landing on `U+30A1..=U+30F6` (ァ–ヶ). Every other character is
/// returned unchanged.
fn to_katakana(&self) -> char {
    match *self {
        // The shifted value always stays inside the katakana block, so it is
        // a valid scalar value and the conversion cannot fail.
        '\u{3041}'..='\u{3096}' => char::from_u32(*self as u32 + 0x60).unwrap(),
        other => other,
    }
}
}

0 comments on commit c016e8f

Please sign in to comment.