diff --git a/japanese_converters.py b/japanese_converters.py index 802c660..9228dc4 100644 --- a/japanese_converters.py +++ b/japanese_converters.py @@ -71,6 +71,7 @@ def debug_parse_manual_furi_line(line: str): print(romaji_syllables) re_hiragana = re.compile(r'[\u3041-\u309f]+') +re_kana = re.compile(r'[\u3041-\u30ff]+') def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]: # On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for # The problem is okurigana matching to the hiragana @@ -82,14 +83,14 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]: kanji_block_indices = [] last_unmatched_pos = 0 furi_regex_pattern = '' - for match in re_hiragana.finditer(kanji): + for match in re_kana.finditer(kanji): start = match.start() if start > last_unmatched_pos: furi_regex_pattern += '(.+)' kanji_block_indices.append(len(furi_blocks)) furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock! furi = match.group(0) - furi_regex_pattern += furi + furi_regex_pattern += furi.translate(katakana_to_hiragana_dict) last_unmatched_pos = match.end() furi_blocks.append(FuriBlock(furi, '')) if remainder := kanji[last_unmatched_pos:]: @@ -99,7 +100,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]: furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous! for idx, furi in zip(kanji_block_indices, furi_groups): - furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi) # str -> FuriBlock + furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock return furi_blocks def parse_japanese_line(line: str):