Add some は=わ hack to prevent konnichiha romaji
This commit is contained in:
parent
e0919ec472
commit
c4f5748963
|
@ -3,6 +3,7 @@ from fugashi import Tagger
|
||||||
import pykakasi
|
import pykakasi
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
kks = pykakasi.kakasi()
|
kks = pykakasi.kakasi()
|
||||||
def to_romaji(japanese: str, separator=' ') -> str:
    """Romanize ``japanese`` using the module-level pykakasi converter.

    Each item produced by ``kks.convert`` contributes its ``'hepburn'``
    entry; the entries are glued together with ``separator``.
    """
    converted = kks.convert(japanese)
    hepburn_parts = [item['hepburn'] for item in converted]
    return separator.join(hepburn_parts)
|
||||||
|
@ -10,6 +11,7 @@ def to_romaji(japanese: str, separator=' ') -> str:
|
||||||
katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
|
katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
|
||||||
katakana_to_hiragana_dict[ord('*')] = ' '
|
katakana_to_hiragana_dict[ord('*')] = ' '
|
||||||
|
|
||||||
|
|
||||||
kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
|
kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
|
||||||
kana_merge_next_syllable = {k for k in 'っッ'}
|
kana_merge_next_syllable = {k for k in 'っッ'}
|
||||||
def kana_to_syllable_string(kana: str):
|
def kana_to_syllable_string(kana: str):
|
||||||
|
@ -29,15 +31,30 @@ def kana_to_syllable_list(kana: str) -> list[str]:
|
||||||
# Kinda inefficient to work it as a string and then split it afterwards, but elegant
|
# Kinda inefficient to work it as a string and then split it afterwards, but elegant
|
||||||
return kana_to_syllable_string(kana)[1:].split('|')
|
return kana_to_syllable_string(kana)[1:].split('|')
|
||||||
|
|
||||||
tagger = Tagger('-Owakati')
|
|
||||||
|
|
||||||
|
tagger = Tagger('-Owakati')
|
||||||
|
re_wa = re.compile(r'ワ')
|
||||||
|
def _parse_jp_token(token) -> tuple[str, str]:
    """Return ``(surface, reading)`` for a single tokenizer token.

    ``surface`` is ``str(token)``.  ``reading`` is ``token.feature.kana`` with
    one correction: at every position where ``token.feature.pron`` has ワ and
    the reading has ハ, the reading's ハ is replaced by ワ.  This is a hack to
    deal with は=わ morae (so that e.g. こんにちは does not become "konnichiha").

    The correction compares the two strings position by position, so it is
    only attempted when they have the same length; otherwise a diagnostic is
    printed and the reading is returned unchanged.

    If either feature field is ``None`` the reading is returned untouched
    (so the second tuple element may be ``None``) instead of raising
    ``TypeError`` from ``len(None)``.
    NOTE(review): presumably the tokenizer yields ``None`` fields for tokens
    without dictionary information - confirm against fugashi's behaviour.
    """
    s = str(token)
    # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
    pronunciation = token.feature.pron
    # katakana with correct vowels
    reading = token.feature.kana

    # Nothing to compare when the token carries no pronunciation/reading.
    if pronunciation is None or reading is None:
        return (s, reading)

    if len(pronunciation) != len(reading):
        print('Different lengths pronunciation and reading:', s, pronunciation, reading)
        return (s, reading)

    # zip() walks the original reading string; rebinding ``reading`` below only
    # ever changes the position currently being visited, so later comparisons
    # are unaffected.
    for i, (pron_char, reading_char) in enumerate(zip(pronunciation, reading)):
        if pron_char == 'ワ' and reading_char == 'ハ':
            print('Found ハ=ワ:', s, pronunciation, reading)
            reading = reading[:i] + 'ワ' + reading[i+1:]
    return (s, reading)
|
||||||
def parse_jp_text(text: str) -> list[tuple[str, str]]:
    """Tokenize ``text`` and return one ``(surface, reading)`` pair per token.

    Tokens come from the module-level ``tagger``; each one is converted by
    ``_parse_jp_token``.
    """
    pairs = []
    for token in tagger(text):
        pairs.append(_parse_jp_token(token))
    return pairs
|
||||||
|
|
||||||
# Our custom word overrides have two levels:
|
# Our custom word overrides have two levels:
|
||||||
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
|
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
|
||||||
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
|
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
|
||||||
word_overrides = {'私': 'わたし', '主': 'しゅ'}
|
default_word_overrides = {'私': ('わたくし', 'わたし'), '主': 'しゅ'}
|
||||||
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
|
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
|
||||||
|
|
||||||
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
|
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
|
||||||
|
@ -54,6 +71,7 @@ def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
|
||||||
blocks.append(FuriBlock(remainder, ''))
|
blocks.append(FuriBlock(remainder, ''))
|
||||||
return blocks
|
return blocks
|
||||||
|
|
||||||
|
|
||||||
# For debugging
|
# For debugging
|
||||||
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
|
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
|
||||||
kanji = ''.join([b.kanji for b in blocks])
|
kanji = ''.join([b.kanji for b in blocks])
|
||||||
|
@ -70,9 +88,11 @@ def debug_parse_manual_furi_line(line: str):
|
||||||
romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
|
romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
|
||||||
print(romaji_syllables)
|
print(romaji_syllables)
|
||||||
|
|
||||||
|
|
||||||
re_hiragana = re.compile(r'[\u3041-\u309f]+')
|
re_hiragana = re.compile(r'[\u3041-\u309f]+')
|
||||||
re_kana = re.compile(r'[\u3041-\u30ff]+')
|
re_kana = re.compile(r'[\u3041-\u30ff]+')
|
||||||
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
|
||||||
|
def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
|
||||||
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
|
# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
|
||||||
# The problem is okurigana matching to the hiragana
|
# The problem is okurigana matching to the hiragana
|
||||||
# In words with multiple kanji split by okurigana, this is a hard problem.
|
# In words with multiple kanji split by okurigana, this is a hard problem.
|
||||||
|
@ -83,7 +103,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||||
kanji_block_indices = []
|
kanji_block_indices = []
|
||||||
last_unmatched_pos = 0
|
last_unmatched_pos = 0
|
||||||
furi_regex_pattern = ''
|
furi_regex_pattern = ''
|
||||||
for match in re_kana.finditer(kanji):
|
for match in regex.finditer(kanji):
|
||||||
start = match.start()
|
start = match.start()
|
||||||
if start > last_unmatched_pos:
|
if start > last_unmatched_pos:
|
||||||
furi_regex_pattern += '(.+)'
|
furi_regex_pattern += '(.+)'
|
||||||
|
@ -97,13 +117,45 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
|
||||||
furi_regex_pattern += '(.+)'
|
furi_regex_pattern += '(.+)'
|
||||||
kanji_block_indices.append(len(furi_blocks))
|
kanji_block_indices.append(len(furi_blocks))
|
||||||
furi_blocks.append(remainder) # str not FuriBlock!
|
furi_blocks.append(remainder) # str not FuriBlock!
|
||||||
|
# This will throw on mismatch, e.g. from は=わ
|
||||||
|
try:
|
||||||
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
|
furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
|
||||||
|
except:
|
||||||
|
# print(furi_regex_pattern)
|
||||||
|
# print(kanji)
|
||||||
|
# print(hiragana)
|
||||||
|
raise
|
||||||
for idx, furi in zip(kanji_block_indices, furi_groups):
|
for idx, furi in zip(kanji_block_indices, furi_groups):
|
||||||
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
|
furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
|
||||||
return furi_blocks
|
return furi_blocks
|
||||||
|
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split a word into furigana blocks, with a は=わ fallback.

    The word is first parsed with the ``re_kana`` pattern.  If that attempt
    raises ``AttributeError``, it is parsed again with ``re_kana_no_ha``, so
    that は is treated like a kanji.  An exception raised by the second
    attempt propagates to the caller.
    """
    try:
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana)
    except AttributeError:
        # Workaround for は=わ - retry with は excluded from the kana pattern
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
    return blocks
|
||||||
|
|
||||||
def parse_japanese_line(line: str):
|
|
||||||
|
def parse_overrides_str(overrides_str: str) -> dict:
    """Build the word-override mapping from a multi-line ``|``-separated string.

    Starts from a copy of ``default_word_overrides`` and applies one entry per
    non-empty line:

    - ``'私|わたくし|わたし'`` becomes ``{'私': ('わたくし', 'わたし')}``
    - ``'私|わたし'`` becomes ``{'私': 'わたし'}``

    Lines with an empty first field or without any ``|`` are skipped; fields
    beyond the third are ignored.
    """
    merged = default_word_overrides.copy()
    for raw_line in overrides_str.splitlines():
        fields = raw_line.strip().split('|')
        word, readings = fields[0], fields[1:]
        # A blank line yields a single empty field, so it is rejected here too.
        if not word or not readings:
            continue
        if len(readings) == 1:
            merged[word] = readings[0]
        else:
            merged[word] = (readings[0], readings[1])
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def parse_japanese_line(line: str, overrides_str: str=''):
|
||||||
|
word_overrides = parse_overrides_str(overrides_str)
|
||||||
# Split line into plaintext segments to be tokenized, and manual furigana segments
|
# Split line into plaintext segments to be tokenized, and manual furigana segments
|
||||||
last_unmatched_pos = 0
|
last_unmatched_pos = 0
|
||||||
word_tokens = []
|
word_tokens = []
|
||||||
|
@ -124,6 +176,8 @@ def parse_japanese_line(line: str):
|
||||||
output['word_pairs'] = []
|
output['word_pairs'] = []
|
||||||
for token in (it := iter(word_tokens)):
|
for token in (it := iter(word_tokens)):
|
||||||
word, katakana = token
|
word, katakana = token
|
||||||
|
if katakana is None: # TODO: Weird edge case with nonsense input, should fix elsewhere
|
||||||
|
katakana = ''
|
||||||
if word[-1] == 'っ': # MeCab splits ?って into ?っ, て so we merge it back
|
if word[-1] == 'っ': # MeCab splits ?って into ?っ, て so we merge it back
|
||||||
try:
|
try:
|
||||||
next_word, next_katakana = next(it)
|
next_word, next_katakana = next(it)
|
||||||
|
@ -133,7 +187,12 @@ def parse_japanese_line(line: str):
|
||||||
pass
|
pass
|
||||||
hiragana = katakana.translate(katakana_to_hiragana_dict)
|
hiragana = katakana.translate(katakana_to_hiragana_dict)
|
||||||
if word in word_overrides: # Note that most word replacements will instead need to be handled BEFORE tokenization!
|
if word in word_overrides: # Note that most word replacements will instead need to be handled BEFORE tokenization!
|
||||||
hiragana = word_overrides[word]
|
override = word_overrides[word]
|
||||||
|
if isinstance(override, tuple):
|
||||||
|
if hiragana == override[0]:
|
||||||
|
hiragana = override[1]
|
||||||
|
else:
|
||||||
|
hiragana = override
|
||||||
output['word_pairs'].append((word, hiragana))
|
output['word_pairs'].append((word, hiragana))
|
||||||
|
|
||||||
# Process our (kanji, hiragana) word pairs into furigana blocks
|
# Process our (kanji, hiragana) word pairs into furigana blocks
|
||||||
|
|
|
@ -13,11 +13,11 @@ PlayResY: {PlayResY}
|
||||||
|
|
||||||
[V4+ Styles]
|
[V4+ Styles]
|
||||||
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
||||||
Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
|
Style: Default,{LatinFont},72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
|
||||||
Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
|
Style: Kanji,{JapaneseFont},{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
|
||||||
Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
|
Style: Furigana,{JapaneseFont},{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
|
||||||
Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
|
Style: Romaji,{LatinFont},{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,20,1
|
||||||
Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1
|
Style: Translation,{LatinFont},{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,20,1
|
||||||
|
|
||||||
[Events]
|
[Events]
|
||||||
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
||||||
|
@ -26,8 +26,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
||||||
format_defaults = {
|
format_defaults = {
|
||||||
'PlayResX': 1280,
|
'PlayResX': 1280,
|
||||||
'PlayResY': 720,
|
'PlayResY': 720,
|
||||||
'TranslationSize': 48,
|
'LatinFont': 'Droid Sans',
|
||||||
'RomajiSize': 60,
|
'JapaneseFont': 'Droid Sans Japanese',
|
||||||
|
'TranslationSize': 36,
|
||||||
|
'RomajiSize': 48,
|
||||||
'KanjiSize': 72,
|
'KanjiSize': 72,
|
||||||
'KanjiVMargin': 20,
|
'KanjiVMargin': 20,
|
||||||
'FuriSize': 36,
|
'FuriSize': 36,
|
||||||
|
|
Loading…
Reference in New Issue