Haphazard katakana and fullwidth space handling
This commit is contained in:
parent
4888f1c99f
commit
e7ffe3e7e7
|
@ -71,6 +71,7 @@ def debug_parse_manual_furi_line(line: str):
|
|||
print(romaji_syllables)
|
||||
|
||||
re_hiragana = re.compile(r'[\u3041-\u309f]+')
|
||||
re_kana = re.compile(r'[\u3041-\u30ff]+')
|
||||
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
|
||||
# The problem is okurigana matching to the hiragana
|
||||
|
@ -82,14 +83,14 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
|||
kanji_block_indices = []
|
||||
last_unmatched_pos = 0
|
||||
furi_regex_pattern = ''
|
||||
for match in re_hiragana.finditer(kanji):
|
||||
for match in re_kana.finditer(kanji):
|
||||
start = match.start()
|
||||
if start > last_unmatched_pos:
|
||||
furi_regex_pattern += '(.+)'
|
||||
kanji_block_indices.append(len(furi_blocks))
|
||||
furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock!
|
||||
furi = match.group(0)
|
||||
furi_regex_pattern += furi
|
||||
furi_regex_pattern += furi.translate(katakana_to_hiragana_dict)
|
||||
last_unmatched_pos = match.end()
|
||||
furi_blocks.append(FuriBlock(furi, ''))
|
||||
if remainder := kanji[last_unmatched_pos:]:
|
||||
|
@ -99,7 +100,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
|||
|
||||
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
|
||||
for idx, furi in zip(kanji_block_indices, furi_groups):
|
||||
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi) # str -> FuriBlock
|
||||
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
|
||||
return furi_blocks
|
||||
|
||||
def parse_japanese_line(line: str):
|
||||
|
|
Loading…
Reference in New Issue