Haphazard katakana and fullwidth space handling

This commit is contained in:
Luke Hubmayer-Werner 2024-12-17 21:28:23 +10:30
parent 4888f1c99f
commit e7ffe3e7e7
1 changed file with 4 additions and 3 deletions

View File

@ -71,6 +71,7 @@ def debug_parse_manual_furi_line(line: str):
print(romaji_syllables)
re_hiragana = re.compile(r'[\u3041-\u309f]+')
re_kana = re.compile(r'[\u3041-\u30ff]+')
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
# The problem is okurigana matching to the hiragana
@ -82,14 +83,14 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
kanji_block_indices = []
last_unmatched_pos = 0
furi_regex_pattern = ''
for match in re_hiragana.finditer(kanji):
for match in re_kana.finditer(kanji):
start = match.start()
if start > last_unmatched_pos:
furi_regex_pattern += '(.+)'
kanji_block_indices.append(len(furi_blocks))
furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock!
furi = match.group(0)
furi_regex_pattern += furi
furi_regex_pattern += furi.translate(katakana_to_hiragana_dict)
last_unmatched_pos = match.end()
furi_blocks.append(FuriBlock(furi, ''))
if remainder := kanji[last_unmatched_pos:]:
@ -99,7 +100,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
for idx, furi in zip(kanji_block_indices, furi_groups):
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi) # str -> FuriBlock
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
return furi_blocks
def parse_japanese_line(line: str):