Add some は=わ hack to prevent konnichiha romaji

This commit is contained in:
Luke Hubmayer-Werner 2024-12-18 20:54:50 +10:30
parent e0919ec472
commit c4f5748963
2 changed files with 77 additions and 16 deletions

View File

@ -3,6 +3,7 @@ from fugashi import Tagger
import pykakasi import pykakasi
import re import re
kks = pykakasi.kakasi() kks = pykakasi.kakasi()
def to_romaji(japanese: str, separator=' ') -> str:
    """Return the Hepburn romanisation of *japanese*.

    The text is run through the module-level pykakasi converter ``kks`` and
    the ``'hepburn'`` field of every converted item is joined with
    *separator*.
    """
    converted = kks.convert(japanese)
    return separator.join(item['hepburn'] for item in converted)
@ -10,6 +11,7 @@ def to_romaji(japanese: str, separator=' ') -> str:
katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)} katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
katakana_to_hiragana_dict[ord('*')] = ' ' katakana_to_hiragana_dict[ord('*')] = ' '
kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'} kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
kana_merge_next_syllable = {k for k in 'っッ'} kana_merge_next_syllable = {k for k in 'っッ'}
def kana_to_syllable_string(kana: str): def kana_to_syllable_string(kana: str):
@ -29,15 +31,30 @@ def kana_to_syllable_list(kana: str) -> list[str]:
# Kinda inefficient to work it as a string and then split it afterwards, but elegant # Kinda inefficient to work it as a string and then split it afterwards, but elegant
return kana_to_syllable_string(kana)[1:].split('|') return kana_to_syllable_string(kana)[1:].split('|')
tagger = Tagger('-Owakati')
tagger = Tagger('-Owakati')
re_wa = re.compile(r'')
def _parse_jp_token(token) -> tuple[str, str]:
s = str(token)
pronunciation: str = token.feature.pron # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
reading: str = token.feature.kana # katakana with correct vowels
# This is a hack to deal with は=わ morae
if len(pronunciation) == len(reading):
for match in re_wa.finditer(pronunciation):
i = match.start()
if reading[i] == '':
print('Found ハ=ワ:', s, pronunciation, reading)
reading = reading[:i] + '' + reading[i+1:]
else:
print('Different lengths pronunciation and reading:', s, pronunciation, reading)
return (s, reading)
def parse_jp_text(text: str) -> list[tuple[str, str]]:
    """Tokenize *text* and return one ``(surface, reading)`` pair per token.

    Tokens come from the module-level ``tagger``; each one is converted by
    ``_parse_jp_token``.
    """
    return list(map(_parse_jp_token, tagger(text)))
# Our custom word overrides have two levels: # Our custom word overrides have two levels:
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words. # - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings. # - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
word_overrides = {'': 'わたし', '': 'しゅ'} default_word_overrides = {'': ('わた', 'わたし'), '': 'しゅ'}
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}') re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]: def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
@ -54,6 +71,7 @@ def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
blocks.append(FuriBlock(remainder, '')) blocks.append(FuriBlock(remainder, ''))
return blocks return blocks
# For debugging # For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]): def furi_blocks_reconstruction(blocks: list[FuriBlock]):
kanji = ''.join([b.kanji for b in blocks]) kanji = ''.join([b.kanji for b in blocks])
@ -70,9 +88,11 @@ def debug_parse_manual_furi_line(line: str):
romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')]) romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
print(romaji_syllables) print(romaji_syllables)
re_hiragana = re.compile(r'[\u3041-\u309f]+') re_hiragana = re.compile(r'[\u3041-\u309f]+')
re_kana = re.compile(r'[\u3041-\u30ff]+') re_kana = re.compile(r'[\u3041-\u30ff]+')
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]: re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for # On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
# The problem is okurigana matching to the hiragana # The problem is okurigana matching to the hiragana
# In words with multiple kanji split by okurigana, this is a hard problem. # In words with multiple kanji split by okurigana, this is a hard problem.
@ -83,7 +103,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
kanji_block_indices = [] kanji_block_indices = []
last_unmatched_pos = 0 last_unmatched_pos = 0
furi_regex_pattern = '' furi_regex_pattern = ''
for match in re_kana.finditer(kanji): for match in regex.finditer(kanji):
start = match.start() start = match.start()
if start > last_unmatched_pos: if start > last_unmatched_pos:
furi_regex_pattern += '(.+)' furi_regex_pattern += '(.+)'
@ -97,13 +117,45 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
furi_regex_pattern += '(.+)' furi_regex_pattern += '(.+)'
kanji_block_indices.append(len(furi_blocks)) kanji_block_indices.append(len(furi_blocks))
furi_blocks.append(remainder) # str not FuriBlock! furi_blocks.append(remainder) # str not FuriBlock!
# This will throw on mismatch, e.g. from は=わ
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous! try:
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
except:
# print(furi_regex_pattern)
# print(kanji)
# print(hiragana)
raise
for idx, furi in zip(kanji_block_indices, furi_groups): for idx, furi in zip(kanji_block_indices, furi_groups):
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
return furi_blocks return furi_blocks
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split a word into furigana blocks, with a fallback for は read as わ.

    The first attempt uses ``re_kana`` to find the kana runs in *kanji*. If
    that attempt raises ``AttributeError``, the parse is retried with
    ``re_kana_no_ha``, whose character class leaves out は so that は is
    treated like a kanji and receives its own furigana.
    """
    try:
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana)
    except AttributeError:
        # Workaround for は=わ: the kana in the word did not line up with the reading.
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
    return blocks
def parse_japanese_line(line: str):
def parse_overrides_str(overrides_str: str) -> dict:
    """Build the word-override table from a multi-line ``|``-separated string.

    Each non-blank line has the form ``word|reading`` or
    ``word|reading_to_match|replacement``:

    * ``'私|わたし'`` yields ``{'私': 'わたし'}``
    * ``'私|わたくし|わたし'`` yields ``{'私': ('わたくし', 'わたし')}``

    Entries are layered on top of a copy of ``default_word_overrides``. Lines
    with an empty word or no reading are ignored, and any fields after the
    third are dropped.
    """
    overrides = default_word_overrides.copy()
    for raw_line in overrides_str.splitlines():
        fields = raw_line.strip().split('|')
        kanji, readings = fields[0], fields[1:]
        # A blank line splits to [''], so it is rejected here as well.
        if not (kanji and readings):
            continue
        if len(readings) == 1:
            overrides[kanji] = readings[0]
        else:
            overrides[kanji] = (readings[0], readings[1])
    return overrides
def parse_japanese_line(line: str, overrides_str: str=''):
word_overrides = parse_overrides_str(overrides_str)
# Split line into plaintext segments to be tokenized, and manual furigana segments # Split line into plaintext segments to be tokenized, and manual furigana segments
last_unmatched_pos = 0 last_unmatched_pos = 0
word_tokens = [] word_tokens = []
@ -124,6 +176,8 @@ def parse_japanese_line(line: str):
output['word_pairs'] = [] output['word_pairs'] = []
for token in (it := iter(word_tokens)): for token in (it := iter(word_tokens)):
word, katakana = token word, katakana = token
if katakana is None: # TODO: Weird edge case with nonsense input, should fix elsewhere
katakana = ''
if word[-1] == '': # MeCab splits ?って into ?っ, て so we merge it back if word[-1] == '': # MeCab splits ?って into ?っ, て so we merge it back
try: try:
next_word, next_katakana = next(it) next_word, next_katakana = next(it)
@ -133,7 +187,12 @@ def parse_japanese_line(line: str):
pass pass
hiragana = katakana.translate(katakana_to_hiragana_dict) hiragana = katakana.translate(katakana_to_hiragana_dict)
if word in word_overrides: # Note that most word replacements will instead need to be handled BEFORE tokenization! if word in word_overrides: # Note that most word replacements will instead need to be handled BEFORE tokenization!
hiragana = word_overrides[word] override = word_overrides[word]
if isinstance(override, tuple):
if hiragana == override[0]:
hiragana = override[1]
else:
hiragana = override
output['word_pairs'].append((word, hiragana)) output['word_pairs'].append((word, hiragana))
# Process our (kanji, hiragana) word pairs into furigana blocks # Process our (kanji, hiragana) word pairs into furigana blocks

View File

@ -13,11 +13,11 @@ PlayResY: {PlayResY}
[V4+ Styles] [V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1 Style: Default,{LatinFont},72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1 Style: Kanji,{JapaneseFont},{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1 Style: Furigana,{JapaneseFont},{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1 Style: Romaji,{LatinFont},{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,20,1
Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1 Style: Translation,{LatinFont},{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,20,1
[Events] [Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
@ -26,8 +26,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
format_defaults = { format_defaults = {
'PlayResX': 1280, 'PlayResX': 1280,
'PlayResY': 720, 'PlayResY': 720,
'TranslationSize': 48, 'LatinFont': 'Droid Sans',
'RomajiSize': 60, 'JapaneseFont': 'Droid Sans Japanese',
'TranslationSize': 36,
'RomajiSize': 48,
'KanjiSize': 72, 'KanjiSize': 72,
'KanjiVMargin': 20, 'KanjiVMargin': 20,
'FuriSize': 36, 'FuriSize': 36,