Add some は＝わ hack to prevent konnichiha romaji

2024-12-18 20:54:50 +10:30 · 2024-12-18 20:54:50 +10:30 · c4f5748963
parent e0919ec472
commit c4f5748963
2 changed files with 77 additions and 16 deletions
--- a/japanese_converters.py
+++ b/japanese_converters.py
@ -3,6 +3,7 @@ from fugashi import Tagger
 import pykakasi
 import re
 kks = pykakasi.kakasi()
 def to_romaji(japanese: str, separator=' ') -> str:
 	# Romanise Japanese text: run it through the module-level pykakasi converter (kks)
 	# and join the 'hepburn' field of every converted item with `separator`.
 	converted_items = kks.convert(japanese)
 	romaji_parts = [item['hepburn'] for item in converted_items]
 	return separator.join(romaji_parts)
@ -10,6 +11,7 @@ def to_romaji(japanese: str, separator=' ') -> str:
 katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
 katakana_to_hiragana_dict[ord('*')] = '　'
 kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
 kana_merge_next_syllable = {k for k in 'っッ'}
 def kana_to_syllable_string(kana: str):
@ -29,15 +31,30 @@ def kana_to_syllable_list(kana: str) -> list[str]:
 	# Kinda inefficient to work it as a string and then split it afterwards, but elegant
 	return kana_to_syllable_string(kana)[1:].split('|')
 tagger = Tagger('-Owakati')
 tagger = Tagger('-Owakati')
 re_wa = re.compile(r'ワ')
 def _parse_jp_token(token) -> tuple[str, str]:
 	# Turn one tagger token into a (surface, reading) pair.
 	# The surface is str(token); the reading is token.feature.kana, with ハ replaced by ワ
 	# at every index where token.feature.pron has ワ (the は＝わ case).
 	# If either feature is None the reading is returned untouched, so it may be None.
 	s = str(token)
 	pronunciation = token.feature.pron  # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
 	reading = token.feature.kana  # katakana with correct vowels
 	# A missing feature would make len() below raise TypeError. Skip the hack instead;
 	# parse_japanese_line appears to handle a None reading already - TODO confirm.
 	if pronunciation is None or reading is None:
 		return (s, reading)
 	# This is a hack to deal with は＝わ morae
 	# Comparing by index only makes sense when both strings have the same length
 	if len(pronunciation) != len(reading):
 		print('Different lengths pronunciation and reading:', s, pronunciation, reading)
 		return (s, reading)
 	for match in re_wa.finditer(pronunciation):
 		i = match.start()
 		if reading[i] == 'ハ':
 			print('Found ハ＝ワ:', s, pronunciation, reading)
 			# Strings are immutable, so rebuild the reading around the replaced character
 			reading = reading[:i] + 'ワ' + reading[i+1:]
 	return (s, reading)
 def parse_jp_text(text: str) -> list[tuple[str, str]]:
-	return [(str(token), token.feature.kana) for token in tagger(text)]
+	return [_parse_jp_token(token) for token in tagger(text)]
 # Our custom word overrides have two levels:
 # - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
 # - The other is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
-word_overrides = {'私': 'わたし', '主': 'しゅ'}
+default_word_overrides = {'私': ('わたくし', 'わたし'), '主': 'しゅ'}
 re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
 def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
@ -54,6 +71,7 @@ def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
 		blocks.append(FuriBlock(remainder, ''))
 	return blocks
 # For debugging
 def furi_blocks_reconstruction(blocks: list[FuriBlock]):
 	kanji = ''.join([b.kanji for b in blocks])
@ -70,9 +88,11 @@ def debug_parse_manual_furi_line(line: str):
 	romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
 	print(romaji_syllables)
 re_hiragana = re.compile(r'[\u3041-\u309f]+')
 re_kana = re.compile(r'[\u3041-\u30ff]+')
-def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
+re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
 def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
 	# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
 	# The problem is okurigana matching to the hiragana
 	# In words with multiple kanji split by okurigana, this is a hard problem.
@ -83,7 +103,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
 	kanji_block_indices = []
 	last_unmatched_pos = 0
 	furi_regex_pattern = ''
-	for match in re_kana.finditer(kanji):
+	for match in regex.finditer(kanji):
 		start = match.start()
 		if start > last_unmatched_pos:
 			furi_regex_pattern += '(.+)'
@ -97,13 +117,45 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
 		furi_regex_pattern += '(.+)'
 		kanji_block_indices.append(len(furi_blocks))
 		furi_blocks.append(remainder)  # str not FuriBlock!
-
+	# This will throw on mismatch, e.g. from は＝わ
-	furi_groups = re.match(furi_regex_pattern, hiragana).groups()  # This could be ambiguous!
+	try:
 		furi_groups = re.match(furi_regex_pattern, hiragana).groups()  # This could be ambiguous!
 	except:
 		# print(furi_regex_pattern)
 		# print(kanji)
 		# print(hiragana)
 		raise
 	for idx, furi in zip(kanji_block_indices, furi_groups):
 		furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip())  # str -> FuriBlock
 	return furi_blocks
 def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
 	# Split a (kanji, hiragana) word pair into furigana blocks.
 	# First attempt treats every kana run in `kanji` as okurigana (re_kana).
 	# If that raises AttributeError, retry with re_kana_no_ha, which leaves は (U+306F)
 	# out of the kana range so it is treated like a kanji - the は＝わ workaround.
 	# An AttributeError from the retry is not caught and propagates to the caller.
 	try:
 		furi_blocks = _word_to_furi_blocks(kanji, hiragana, re_kana)
 	except AttributeError:
 		furi_blocks = _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
 	return furi_blocks
-def parse_japanese_line(line: str):
+
 def parse_overrides_str(overrides_str: str) -> dict:
 	# Build the word-override table: a copy of default_word_overrides, updated with one
 	# '|'-separated entry per line of overrides_str.
 	# Convert '私|わたくし|わたし' to {'私': ('わたくし', 'わたし')}
 	# Convert '私|わたし' to {'私': 'わたし'}
 	# Blank lines and entries with an empty word or an empty reading are skipped, so a
 	# stray '私|' cannot register an empty reading. Fields after the third are ignored.
 	overrides_dict = default_word_overrides.copy()
 	for line in overrides_str.splitlines():
 		# Strip every field so '私 | わたし' keys on '私' rather than '私 '
 		kanji, *replacement = (field.strip() for field in line.split('|'))
 		replacement = replacement[:2]  # only the first two readings are used
 		if not kanji or not replacement or not all(replacement):
 			continue
 		if len(replacement) > 1:
 			# Two readings: (reading to match, reading to substitute)
 			overrides_dict[kanji] = (replacement[0], replacement[1])
 		else:
 			# One reading: unconditional override
 			overrides_dict[kanji] = replacement[0]
 	return overrides_dict
 def parse_japanese_line(line: str, overrides_str: str=''):
 	word_overrides = parse_overrides_str(overrides_str)
 	# Split line into plaintext segments to be tokenized, and manual furigana segments
 	last_unmatched_pos = 0
 	word_tokens = []
@ -124,6 +176,8 @@ def parse_japanese_line(line: str):
 	output['word_pairs'] = []
 	for token in (it := iter(word_tokens)):
 		word, katakana = token
 		if katakana is None:  # TODO: Weird edge case with nonsense input, should fix elsewhere
 			katakana = ''
 		if word[-1] == 'っ':  # MeCab splits ？って into ？っ, て so we merge it back
 			try:
 				next_word, next_katakana = next(it)
@ -133,7 +187,12 @@ def parse_japanese_line(line: str):
 				pass
 		hiragana = katakana.translate(katakana_to_hiragana_dict)
 		if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
-			hiragana = word_overrides[word]
+			override = word_overrides[word]
 			if isinstance(override, tuple):
 				if hiragana == override[0]:
 					hiragana = override[1]
 			else:
 				hiragana = override
 		output['word_pairs'].append((word, hiragana))
 	# Process our (kanji, hiragana) word pairs into furigana blocks
--- a/subtitle_generator.py
+++ b/subtitle_generator.py
@ -13,11 +13,11 @@ PlayResY: {PlayResY}
 [V4+ Styles]
 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
-Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Default,{LatinFont},72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
-Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
+Style: Kanji,{JapaneseFont},{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
-Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
+Style: Furigana,{JapaneseFont},{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
-Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Romaji,{LatinFont},{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,20,1
-Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1
+Style: Translation,{LatinFont},{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,20,1
 [Events]
 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
@ -26,8 +26,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 format_defaults = {
 	'PlayResX': 1280,
 	'PlayResY': 720,
-	'TranslationSize': 48,
+	'LatinFont': 'Droid Sans',
-	'RomajiSize': 60,
+	'JapaneseFont': 'Droid Sans Japanese',
 	'TranslationSize': 36,
 	'RomajiSize': 48,
 	'KanjiSize': 72,
 	'KanjiVMargin': 20,
 	'FuriSize': 36,