commit 4888f1c99fb8fb4fd0ee91f5ee1c860c9b452dc3
Author: Luke Hubmayer-Werner
Date:   Tue Dec 17 21:14:21 2024 +1030

    Initial commit

diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..bf281e5
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,14 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+fugashi = "*"
+unidic = "*"
+pykakasi = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.12"
diff --git a/format.py b/format.py
new file mode 100644
index 0000000..cfeaf47
--- /dev/null
+++ b/format.py
@@ -0,0 +1,47 @@
+"""Data model for timed karaoke lyrics."""
+from collections import namedtuple
+from dataclasses import dataclass, field
+
+# store '{与|あた}えた{使命|しめい} ' as [('与','あた'), ('えた',''), ('使命','しめい'), (' ','')]
+# `furi` is the empty string for text that needs no furigana.
+FuriBlock = namedtuple('FuriBlock', ['kanji', 'furi'])
+
+
+@dataclass(eq=False)  # eq=False keeps identity-based equality and hashing
+class LyricLine:
+    """One lyric line: its text in several forms plus per-syllable beat stamps.
+
+    Every field has a default, so ``LyricLine()`` followed by attribute
+    assignment still works. The list defaults are built per instance; a
+    class-level ``= []`` would be one list shared by every line.
+    """
+
+    # Start at zero for each line, do real timing via get_timestamps()
+    beat_stamps: list[float] = field(default_factory=list)
+    translated_line: str = ''
+    # Allow space entries which will be skipped over when calculating timing
+    romaji_syllables: list[str] = field(default_factory=list)
+    furi_blocks: list[FuriBlock] = field(default_factory=list)
+
+    def get_timestamps(self, bpm: float, start_offset: float) -> list[float]:
+        """Convert the line's beat stamps to seconds.
+
+        Args:
+            bpm: Tempo in beats per minute; must be non-zero.
+            start_offset: Seconds added to every converted stamp.
+
+        Returns:
+            One timestamp per entry of ``beat_stamps``, in the same order.
+
+        Raises:
+            ZeroDivisionError: If ``bpm`` is zero.
+        """
+        spb = 60.0 / bpm  # seconds per beat
+        return [spb * beat + start_offset for beat in self.beat_stamps]
+
+
+@dataclass(eq=False)
+class LyricTrack:
+    """An ordered list of lyric lines."""
+
+    lines: list[LyricLine] = field(default_factory=list)
diff --git a/japanese_converters.py b/japanese_converters.py
new file mode 100644
index 0000000..802c660
--- /dev/null
+++ b/japanese_converters.py
@@ -0,0 +1,218 @@
+"""Turn Japanese lyric text into furigana blocks and romaji syllables."""
+import re
+
+import pykakasi
+from fugashi import Tagger
+
+from format import FuriBlock, LyricLine
+
+kks = pykakasi.kakasi()
+
+
+def to_romaji(japanese: str, separator=' ') -> str:
+    """Romanise *japanese* using pykakasi's Hepburn output.
+
+    Args:
+        japanese: Text to convert.
+        separator: Placed between the chunks that pykakasi returns.
+
+    Returns:
+        The ``'hepburn'`` value of every chunk, joined with *separator*.
+    """
+    return separator.join(word['hepburn'] for word in kks.convert(japanese))
+
+
+# str.translate() table mapping katakana U+30A1..U+30F6 onto hiragana U+3041..U+3096
+katakana_to_hiragana_dict = {0x30A0 + i: chr(0x3040 + i) for i in range(1, 87)}
+# NOTE(review): '*' looks like the tokenizer's "no reading" placeholder - confirm
+katakana_to_hiragana_dict[ord('*')] = ' '
+
+# Small kana and ん are sung as part of the syllable before them,
+# while a sokuon (っ) belongs to the syllable after it.
+kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
+kana_merge_next_syllable = set('っッ')
+
+
+def kana_to_syllable_list(kana: str) -> list[str]:
+    """Split *kana* into syllables.
+
+    A sokuon starts a syllable and swallows the character after it; small
+    kana and ん are appended to the syllable before them.
+
+    Returns:
+        The syllables in order; an empty list for empty input. A leading
+        small kana or ん has nothing to attach to and forms its own syllable.
+    """
+    syllables: list[str] = []
+    chars = iter(kana)
+    for k in chars:
+        if k in kana_merge_next_syllable:
+            # The default stops a trailing sokuon from raising StopIteration
+            syllables.append(k + next(chars, ''))
+        elif k in kana_merge_previous_syllable and syllables:
+            syllables[-1] += k
+        else:
+            syllables.append(k)
+    return syllables
+
+
+def kana_to_syllable_string(kana: str) -> str:
+    """Return the syllables of *kana*, each one prefixed with ``'|'``."""
+    return ''.join('|' + syl for syl in kana_to_syllable_list(kana))
+
+
+tagger = Tagger('-Owakati')
+
+
+def parse_jp_text(text: str) -> list[tuple[str, str]]:
+    """Tokenise *text* into ``(surface, reading)`` pairs.
+
+    The reading is the tokenizer's ``kana`` feature. When that is missing
+    (None or empty) the surface form is used, so callers always get a str.
+    """
+    return [(str(token), token.feature.kana or str(token)) for token in tagger(text)]
+
+
+# Our custom word overrides have two levels:
+# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
+# - The other is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
+word_overrides = {'主': 'しゅ'}
+re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
+
+
+def _split_manual_furi(line: str):
+    """Yield ``(text, None)`` for plain text and ``(kanji, furi)`` for each ``{kanji|furi}``, in order."""
+    last_unmatched_pos = 0
+    for match in re_manual_furi.finditer(line):
+        if match.start() > last_unmatched_pos:
+            yield line[last_unmatched_pos:match.start()], None
+        yield match.groups()
+        last_unmatched_pos = match.end()
+    if remainder := line[last_unmatched_pos:]:
+        yield remainder, None
+
+
+def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
+    """Parse a line in manual furigana format into blocks.
+
+    Text outside ``{kanji|furi}`` groups becomes a block with an empty furi.
+    """
+    return [FuriBlock(text, furi or '') for text, furi in _split_manual_furi(line)]
+
+
+# For debugging
+def furi_blocks_reconstruction(blocks: list[FuriBlock]):
+    """Rebuild the ``(kanji, kana)`` spellings of a line from its blocks."""
+    kanji = ''.join(b.kanji for b in blocks)
+    kana = ''.join(b.furi or b.kanji for b in blocks)
+    return kanji, kana
+
+
+def debug_parse_manual_furi_line(line: str):
+    """Print the kanji, kana, syllable and romaji forms of a manual furigana line."""
+    blocks = manual_furi_string_to_blocks(line)
+    kanji, kana = furi_blocks_reconstruction(blocks)
+    print(kanji)
+    print(kana)
+    syllables = kana_to_syllable_string(kana)
+    print(syllables)
+    romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
+    print(romaji_syllables)
+
+
+re_hiragana = re.compile(r'[\u3041-\u309f]+')
+
+
+def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
+    """Split one word into furigana blocks by aligning its okurigana with its reading.
+
+    Args:
+        kanji: The word as written.
+        hiragana: Reading of the whole word.
+
+    Returns:
+        Hiragana runs of *kanji* as blocks without furi, and every other run
+        paired with its share of *hiragana*. If the reading cannot be aligned
+        the whole word is returned as a single block.
+    """
+    # On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
+    # The problem is okurigana matching to the hiragana
+    # In words with multiple kanji split by okurigana, this is a hard problem.
+
+    # (text, needs_furi) runs: hiragana in the written word needs no furi
+    segments: list[tuple[str, bool]] = []
+    last_unmatched_pos = 0
+    for match in re_hiragana.finditer(kanji):
+        if match.start() > last_unmatched_pos:
+            segments.append((kanji[last_unmatched_pos:match.start()], True))
+        segments.append((match.group(0), False))
+        last_unmatched_pos = match.end()
+    if remainder := kanji[last_unmatched_pos:]:
+        segments.append((remainder, True))
+
+    if not any(needs_furi for _, needs_furi in segments):
+        return [FuriBlock(text, '') for text, _ in segments]
+
+    # Naive approach: pick out all the kana and make a regex
+    # e.g. turn '打ち合わせ' into r'(.+)ち(.+)わせ'
+    furi_regex_pattern = ''.join(
+        '(.+)' if needs_furi else re.escape(text) for text, needs_furi in segments
+    )
+    aligned = re.fullmatch(furi_regex_pattern, hiragana)  # This could be ambiguous!
+    if aligned is None:
+        # Previously an AttributeError on None; one block still shows the full reading
+        return [FuriBlock(kanji, hiragana)]
+    furi_groups = iter(aligned.groups())
+    return [
+        FuriBlock(text, next(furi_groups) if needs_furi else '')
+        for text, needs_furi in segments
+    ]
+
+
+def parse_japanese_line(line: str):
+    """Tokenise a lyric line and derive its furigana blocks and romaji syllables.
+
+    Args:
+        line: Japanese text, optionally containing ``{kanji|furi}`` groups.
+
+    Returns:
+        A dict with the keys ``'word_pairs'`` (``(word, hiragana)`` tuples),
+        ``'furi_blocks'`` (``FuriBlock`` list) and ``'romaji_syllables'``
+        (romaji strings with ``' '`` entries separating the words).
+    """
+    # Split line into plaintext segments to be tokenized, and manual furigana segments
+    word_tokens = []
+    for text, furi in _split_manual_furi(line):
+        if furi is None:
+            word_tokens += parse_jp_text(text)  # Process japanese plaintext
+        else:
+            word_tokens.append((text, furi))  # Already sorted into (kanji, hiragana)
+    # We now have a series of word tokens that have had all the manual furigana handled
+
+    output = {}
+
+    # Process the tokens
+    word_pairs = output['word_pairs'] = []
+    for word, katakana in (it := iter(word_tokens)):
+        if word.endswith('っ'):  # MeCab splits ?って into ?っ, て so we merge it back
+            next_word, next_katakana = next(it, ('', ''))
+            word += next_word
+            katakana += next_katakana
+        hiragana = katakana.translate(katakana_to_hiragana_dict)
+        # Note that most word replacements will instead need to be handled BEFORE tokenization!
+        hiragana = word_overrides.get(word, hiragana)
+        word_pairs.append((word, hiragana))
+
+    # Process our (kanji, hiragana) word pairs into furigana blocks
+    furi_blocks = output['furi_blocks'] = []  # Must be iterated for timing
+    for kanji, hiragana in word_pairs:
+        furi_blocks += word_to_furi_blocks(kanji, hiragana)
+
+    # Create word-spaced romaji syllables
+    romaji_syllables = output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
+    for _, hiragana in word_pairs:
+        romaji_syllables += [to_romaji(s) for syl in kana_to_syllable_list(hiragana) if (s := syl.strip())]
+        # The emptiness test covers a first word that produced no syllables
+        if romaji_syllables and romaji_syllables[-1] != ' ':
+            romaji_syllables.append(' ')
+    return output
diff --git a/subtitle_generator.py b/subtitle_generator.py
new file mode 100644
index 0000000..25a81a4
--- /dev/null
+++ b/subtitle_generator.py
@@ -0,0 +1,85 @@
+# Substation Alpha (ASS) generation
+
+# Colour values are &HAABBGGRR, &HBBGGRR, or &HAA.
+# Alpha is actually inverted, i.e. transparency - FF is transparent, 00 is opaque.
+
+from format import LyricTrack
+
+ass_preamble = '''[Script Info]
+ScriptType: v4.00+
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.709
+PlayResX: {PlayResX}
+PlayResY: {PlayResY}
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,25,1
+Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,30,30,0,1
+Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+'''
+
+format_defaults = {
+    'PlayResX': 1280,
+    'PlayResY': 720,
+    'TranslationSize': 48,
+    'RomajiSize': 60,
+    'KanjiSize': 72,
+    'FuriSize': 36,
+    'KaraokeColourFuture': '000019FF',
+    'KaraokeColourPast': 'E02A0A00',
+}
+
+
+def generate_ass(filename: str, lyric_track: LyricTrack, format_overloads: dict | None = None):
+    """Write an ASS subtitle file for *lyric_track* to *filename*.
+
+    Only the script header and style definitions are written so far;
+    *lyric_track* is not read yet.
+
+    Args:
+        filename: Path of the file to create or overwrite.
+        lyric_track: The lyrics to render.
+        format_overloads: Optional replacements for entries of ``format_defaults``.
+    """
+    format_dict = format_defaults.copy()
+    if format_overloads:
+        format_dict.update(format_overloads)
+    # The template uses named fields, so the dict must be unpacked into keyword
+    # arguments; passed positionally, str.format() raises KeyError.
+    preamble = ass_preamble.format(**format_dict)
+
+    # Kanji Furigana layout stuff
+    size_kanji = format_dict['KanjiSize']
+    size_furi = format_dict['FuriSize']
+
+    # Lyrics are Japanese, so don't depend on the platform's default encoding
+    with open(filename, 'w', encoding='utf-8') as file:
+        file.write(preamble)
+        # TODO: write the Dialogue events, roughly:
+        # for line in lines:
+        #     #
+        #     for syllable in line:
+        #         t, kanji, furi, romaji = syllable
+
+
+# Backslashes are ASS override tags, hence the raw string
+example_layout = r'''
+Dialogue: 0,0:01:08.00,0:01:26.00,Kanji,,,,,,{\k0}{\K100}雨{\K100}や{\K100}雪{\K100}が{\K100}天{\K100}から{\K100}降{\K100}って{\K100}地{\K100}を{\K100}潤{\K100}し {\K100}芽{\K100}を{\K100}出{\K100}さ{\K100}せ{\K100}る
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0,1130,,,{\k0}{\K100}あめ
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 900,,,{\k200}{\K100}ゆき
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 700,,,{\k400}{\K100}てん
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 370,,,{\k600}{\K100}ふ
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 0,,,{\k800}{\K100}ち
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 260, 0,,,{\k1000}{\K100}うるお
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 570, 0,,,{\k1200}{\K100}め
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 800, 0,,,{\k1400}{\K100}だ
+Dialogue: 0,0:01:08.00,0:01:26.00,Translation,,,,,,Rain and snow fall from the heavens. Moisten the earth and make it sprout
+Dialogue: 0,0:01:08.00,0:01:26.00,Romaji,,,,,,{\K0}{\K100}ame {\K100}ya {\K100}yuki {\K100}ga {\K100}ten {\K100}kara {\K100}fu{\K100}tte {\K100}chi {\K100}wo {\K100}uruo{\K100}shi {\K100}me {\K100}wo {\K100}da{\K100}sa {\K100}se{\K100}ru
+'''