Patch summary (text before the first "diff --git" line is skipped by patch tools):

  japanese_converters.py
    * add _parse_jp_token(): where the token's pronunciation has 'ワ' and the
      same-index character of its kana reading is 'ハ', the reading gets 'ワ'.
    * rename word_overrides -> default_word_overrides; an override value may
      now be a (match_reading, replacement) tuple.
    * add parse_overrides_str() and an overrides_str parameter on
      parse_japanese_line().
    * word_to_furi_blocks() becomes a wrapper that retries
      _word_to_furi_blocks() with re_kana_no_ha after an AttributeError.
  subtitle_generator.py
    * style fonts become {LatinFont} / {JapaneseFont} placeholders with
      defaults added to format_defaults; TranslationSize 48 -> 36,
      RomajiSize 60 -> 48; MarginV 25 -> 20 on the Romaji/Translation styles.

NOTE(review): recovered from a whitespace-collapsed copy. Line breaks,
indentation and the position of blank context lines are reconstructed from
Python syntax and the hunk counts - confirm against the target files, or
apply with `git apply --ignore-whitespace`.
NOTE(review): the hunk at old line 97 is counted here as 46 new lines
(36 additions + 10 context); the collapsed copy said 45, so the new-side
start of the two following hunks is shifted by one. Verify.
NOTE(review): the bare `except:` added in _word_to_furi_blocks is narrowed to
`except AttributeError:` (it only re-raises, so behaviour is unchanged).

diff --git a/japanese_converters.py b/japanese_converters.py
index ffe9759..c32f0f6 100644
--- a/japanese_converters.py
+++ b/japanese_converters.py
@@ -3,6 +3,7 @@ from fugashi import Tagger
 import pykakasi
 import re
 
+
 kks = pykakasi.kakasi()
 def to_romaji(japanese: str, separator=' ') -> str:
     return separator.join([word['hepburn'] for word in kks.convert(japanese)])
@@ -10,6 +11,7 @@ def to_romaji(japanese: str, separator=' ') -> str:
 katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
 katakana_to_hiragana_dict[ord('*')] = ' '
 
+
 kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
 kana_merge_next_syllable = {k for k in 'っッ'}
 def kana_to_syllable_string(kana: str):
@@ -29,15 +31,30 @@ def kana_to_syllable_list(kana: str) -> list[str]:
     # Kinda inefficient to work it as a string and then split it afterwards, but elegant
     return kana_to_syllable_string(kana)[1:].split('|')
 
-tagger = Tagger('-Owakati')
+tagger = Tagger('-Owakati')
+re_wa = re.compile(r'ワ')
+def _parse_jp_token(token) -> tuple[str, str]:
+    s = str(token)
+    pronunciation: str = token.feature.pron # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
+    reading: str = token.feature.kana # katakana with correct vowels
+    # This is a hack to deal with は=わ morae
+    if len(pronunciation) == len(reading):
+        for match in re_wa.finditer(pronunciation):
+            i = match.start()
+            if reading[i] == 'ハ':
+                print('Found ハ=ワ:', s, pronunciation, reading)
+                reading = reading[:i] + 'ワ' + reading[i+1:]
+    else:
+        print('Different lengths pronunciation and reading:', s, pronunciation, reading)
+    return (s, reading)
 def parse_jp_text(text: str) -> list[tuple[str, str]]:
-    return [(str(token), token.feature.kana) for token in tagger(text)]
+    return [_parse_jp_token(token) for token in tagger(text)]
 
 
 # Our custom word overrides have two levels:
 # - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
 # - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
-word_overrides = {'私': 'わたし', '主': 'しゅ'}
+default_word_overrides = {'私': ('わたくし', 'わたし'), '主': 'しゅ'}
 
 re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
 def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
@@ -54,6 +71,7 @@ def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
         blocks.append(FuriBlock(remainder, ''))
     return blocks
 
+
 # For debugging
 def furi_blocks_reconstruction(blocks: list[FuriBlock]):
     kanji = ''.join([b.kanji for b in blocks])
@@ -70,9 +88,11 @@ def debug_parse_manual_furi_line(line: str):
     romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
     print(romaji_syllables)
 
+
 re_hiragana = re.compile(r'[\u3041-\u309f]+')
 re_kana = re.compile(r'[\u3041-\u30ff]+')
-def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
+re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
+def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
     # On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
     # The problem is okurigana matching to the hiragana
     # In words with multiple kanji split by okurigana, this is a hard problem.
@@ -83,7 +103,7 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
     kanji_block_indices = []
     last_unmatched_pos = 0
     furi_regex_pattern = ''
-    for match in re_kana.finditer(kanji):
+    for match in regex.finditer(kanji):
         start = match.start()
         if start > last_unmatched_pos:
             furi_regex_pattern += '(.+)'
@@ -97,13 +117,46 @@ def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
         furi_regex_pattern += '(.+)'
         kanji_block_indices.append(len(furi_blocks))
         furi_blocks.append(remainder) # str not FuriBlock!
-
-    furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
+    # This will throw on mismatch, e.g. from は=わ
+    try:
+        furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
+    except AttributeError:
+        # print(furi_regex_pattern)
+        # print(kanji)
+        # print(hiragana)
+        raise
     for idx, furi in zip(kanji_block_indices, furi_groups):
         furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
     return furi_blocks
+def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
+    # Workaround for は=わ - try treating は as a kanji if first parse fails
+    try:
+        return _word_to_furi_blocks(kanji, hiragana, re_kana)
+    except AttributeError:
+        return _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
 
-def parse_japanese_line(line: str):
+
+
+def parse_overrides_str(overrides_str: str) -> dict:
+    # Convert '私|わたくし|わたし' to {'私': ('わたくし', 'わたし')}
+    # Convert '私|わたし' to {'私': 'わたし'}
+    overrides_dict = default_word_overrides.copy()
+    for line in overrides_str.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        kanji, *replacement = line.split('|')
+        if not kanji or not replacement:
+            continue
+        if len(replacement) > 1:
+            overrides_dict[kanji] = (replacement[0], replacement[1])
+        else:
+            overrides_dict[kanji] = replacement[0]
+    return overrides_dict
+
+
+def parse_japanese_line(line: str, overrides_str: str=''):
+    word_overrides = parse_overrides_str(overrides_str)
     # Split line into plaintext segments to be tokenized, and manual furigana segments
     last_unmatched_pos = 0
     word_tokens = []
@@ -124,6 +177,8 @@ def parse_japanese_line(line: str):
     output['word_pairs'] = []
     for token in (it := iter(word_tokens)):
         word, katakana = token
+        if katakana is None: # TODO: Weird edge case with nonsense input, should fix elsewhere
+            katakana = ''
         if word[-1] == 'っ': # MeCab splits ?って into ?っ, て so we merge it back
             try:
                 next_word, next_katakana = next(it)
@@ -133,7 +188,12 @@ def parse_japanese_line(line: str):
                 pass
         hiragana = katakana.translate(katakana_to_hiragana_dict)
         if word in word_overrides: # Note that most word replacements will instead need to be handled BEFORE tokenization!
-            hiragana = word_overrides[word]
+            override = word_overrides[word]
+            if isinstance(override, tuple):
+                if hiragana == override[0]:
+                    hiragana = override[1]
+            else:
+                hiragana = override
         output['word_pairs'].append((word, hiragana))
 
     # Process our (kanji, hiragana) word pairs into furigana blocks
diff --git a/subtitle_generator.py b/subtitle_generator.py
index 4689e29..4b36bfd 100644
--- a/subtitle_generator.py
+++ b/subtitle_generator.py
@@ -13,11 +13,11 @@ PlayResY: {PlayResY}
 
 [V4+ Styles]
 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
-Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
-Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
-Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
-Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
-Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1
+Style: Default,{LatinFont},72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Kanji,{JapaneseFont},{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,{KanjiVMargin},1
+Style: Furigana,{JapaneseFont},{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,0,0,{FuriVMargin},1
+Style: Romaji,{LatinFont},{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,20,1
+Style: Translation,{LatinFont},{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,20,1
 
 [Events]
 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
@@ -26,8 +26,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 format_defaults = {
     'PlayResX': 1280,
     'PlayResY': 720,
-    'TranslationSize': 48,
-    'RomajiSize': 60,
+    'LatinFont': 'Droid Sans',
+    'JapaneseFont': 'Droid Sans Japanese',
+    'TranslationSize': 36,
+    'RomajiSize': 48,
     'KanjiSize': 72,
     'KanjiVMargin': 20,
     'FuriSize': 36,