Add haphazard handling of katakana and fullwidth spaces
This commit is contained in:
parent
4888f1c99f
commit
e7ffe3e7e7
|
@ -71,6 +71,7 @@ def debug_parse_manual_furi_line(line: str):
|
||||||
print(romaji_syllables)
|
print(romaji_syllables)
|
||||||
|
|
||||||
re_hiragana = re.compile(r'[\u3041-\u309f]+')
|
re_hiragana = re.compile(r'[\u3041-\u309f]+')
|
||||||
|
re_kana = re.compile(r'[\u3041-\u30ff]+')
|
||||||
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||||
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
|
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
|
||||||
# The problem is okurigana matching to the hiragana
|
# The problem is okurigana matching to the hiragana
|
||||||
|
@ -82,14 +83,14 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||||
kanji_block_indices = []
|
kanji_block_indices = []
|
||||||
last_unmatched_pos = 0
|
last_unmatched_pos = 0
|
||||||
furi_regex_pattern = ''
|
furi_regex_pattern = ''
|
||||||
for match in re_hiragana.finditer(kanji):
|
for match in re_kana.finditer(kanji):
|
||||||
start = match.start()
|
start = match.start()
|
||||||
if start > last_unmatched_pos:
|
if start > last_unmatched_pos:
|
||||||
furi_regex_pattern += '(.+)'
|
furi_regex_pattern += '(.+)'
|
||||||
kanji_block_indices.append(len(furi_blocks))
|
kanji_block_indices.append(len(furi_blocks))
|
||||||
furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock!
|
furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock!
|
||||||
furi = match.group(0)
|
furi = match.group(0)
|
||||||
furi_regex_pattern += furi
|
furi_regex_pattern += furi.translate(katakana_to_hiragana_dict)
|
||||||
last_unmatched_pos = match.end()
|
last_unmatched_pos = match.end()
|
||||||
furi_blocks.append(FuriBlock(furi, ''))
|
furi_blocks.append(FuriBlock(furi, ''))
|
||||||
if remainder := kanji[last_unmatched_pos:]:
|
if remainder := kanji[last_unmatched_pos:]:
|
||||||
|
@ -99,7 +100,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||||
|
|
||||||
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
|
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
|
||||||
for idx, furi in zip(kanji_block_indices, furi_groups):
|
for idx, furi in zip(kanji_block_indices, furi_groups):
|
||||||
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi) # str -> FuriBlock
|
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
|
||||||
return furi_blocks
|
return furi_blocks
|
||||||
|
|
||||||
def parse_japanese_line(line: str):
|
def parse_japanese_line(line: str):
|
||||||
|
|
Loading…
Reference in New Issue