Initial commit

2024-12-17 21:14:21 +10:30 · 2024-12-17 21:14:21 +10:30 · 4888f1c99f
commit 4888f1c99f
4 changed files with 246 additions and 0 deletions
--- a/14
+++ b/14
@ -0,0 +1,14 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+fugashi = "*"
+unidic = "*"
+pykakasi = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.12"
--- a/format.py
+++ b/format.py
@ -0,0 +1,16 @@
+from collections import namedtuple
+# store '{与|あた}えた{使命|しめい}　' as [('与','あた'), ('えた',''), ('使命','しめい'), ('　','')]
+FuriBlock = namedtuple('FuriBlock', ['kanji', 'furi'])
+
+class LyricLine:
+	beat_stamps: list[float] = []  # Start at zero for each line, do real timing via get_timestamps()
+	translated_line: str
+	romaji_syllables: list[str]  # Allow space entries which will be skipped over when calculating timing
+	furi_blocks: list[FuriBlock]
+
+	def get_timestamps(self, bpm: float, start_offset: float) -> list[float]:
+		spb = 60.0/bpm  # seconds per beat
+		return [(spb*beat)+start_offset for beat in self.beat_stamps]
+
+class LyricTrack:
+	lines: list[LyricLine]
--- a/japanese_converters.py
+++ b/japanese_converters.py
@ -0,0 +1,149 @@
+from format import LyricLine, FuriBlock
+from fugashi import Tagger
+import pykakasi
+import re
+
+kks = pykakasi.kakasi()
+def to_romaji(japanese: str, separator=' ') -> str:
+	return separator.join([word['hepburn'] for word in kks.convert(japanese)])
+
+katakana_to_hiragana_dict = {0x30A0+i:chr(0x3040+i) for i in range(1, 87)}
+katakana_to_hiragana_dict[ord('*')] = '　'
+
+kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
+kana_merge_next_syllable = {k for k in 'っッ'}
+def kana_to_syllable_string(kana: str):
+	syl = ''
+	for k in (it := iter(kana)):
+		if k in kana_merge_next_syllable:
+			k += next(it)
+			syl += '|' + k
+			continue
+		# Else
+		if k not in kana_merge_previous_syllable:
+			syl += '|'
+		syl += k
+	return syl
+
+def kana_to_syllable_list(kana: str) -> list[str]:
+	# Kinda inefficient to work it as a string and then split it afterwards, but elegant
+	return kana_to_syllable_string(kana)[1:].split('|')
+
+tagger = Tagger('-Owakati')
+
+def parse_jp_text(text: str) -> list[tuple[str, str]]:
+	return [(str(token), token.feature.kana) for token in tagger(text)]
+
+# Our custom word overrides have two levels:
+# - The first is a simple search-replace that turns matches into the manual furigana "{kanji|furi}" format. This could have false hits on short words.
+# - The second overrides a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
+# word_overrides holds the second kind: parse_japanese_line() looks tokens up in it by their exact text.
+word_overrides = {'主': 'しゅ'}
+re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
+
+def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
+	last_unmatched_pos = 0
+	blocks = []
+	for match in re_manual_furi.finditer(line):
+		start = match.start()
+		end = match.end()
+		if start > last_unmatched_pos:
+			blocks.append(FuriBlock(line[last_unmatched_pos:start], ''))
+		blocks.append(FuriBlock(*match.groups()))
+		last_unmatched_pos = end
+	if remainder := line[last_unmatched_pos:]:
+		blocks.append(FuriBlock(remainder, ''))
+	return blocks
+
+# For debugging
+def furi_blocks_reconstruction(blocks: list[FuriBlock]):
+	kanji = ''.join([b.kanji for b in blocks])
+	kana = ''.join([b.furi if b.furi else b.kanji for b in blocks])
+	return kanji, kana
+
+def debug_parse_manual_furi_line(line: str):
+	blocks = manual_furi_string_to_blocks(line)
+	kanji, kana = furi_blocks_reconstruction(blocks)
+	print(kanji)
+	print(kana)
+	syllables = kana_to_syllable_string(kana)
+	print(syllables)
+	romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
+	print(romaji_syllables)
+
+re_hiragana = re.compile(r'[\u3041-\u309f]+')
+def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
+	# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
+	# The problem is okurigana matching to the hiragana
+	# In words with multiple kanji split by okurigana, this is a hard problem.
+
+	# Naive approach: pick out all the kana and make a regex
+	# e.g. turn '打ち合わせ' into r'(.*)ち(.*)わせ'
+	furi_blocks = []
+	kanji_block_indices = []
+	last_unmatched_pos = 0
+	furi_regex_pattern = ''
+	for match in re_hiragana.finditer(kanji):
+		start = match.start()
+		if start > last_unmatched_pos:
+			furi_regex_pattern += '(.+)'
+			kanji_block_indices.append(len(furi_blocks))
+			furi_blocks.append(kanji[last_unmatched_pos:start])  # str not FuriBlock!
+		furi = match.group(0)
+		furi_regex_pattern += furi
+		last_unmatched_pos = match.end()
+		furi_blocks.append(FuriBlock(furi, ''))
+	if remainder := kanji[last_unmatched_pos:]:
+		furi_regex_pattern += '(.+)'
+		kanji_block_indices.append(len(furi_blocks))
+		furi_blocks.append(remainder)  # str not FuriBlock!
+
+	furi_groups = re.match(furi_regex_pattern, hiragana).groups()  # This could be ambiguous!
+	for idx, furi in zip(kanji_block_indices, furi_groups):
+		furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi)  # str -> FuriBlock
+	return furi_blocks
+
+def parse_japanese_line(line: str):
+	# Split line into plaintext segments to be tokenized, and manual furigana segments
+	last_unmatched_pos = 0
+	word_tokens = []
+	for match in re_manual_furi.finditer(line):
+		start = match.start()
+		end = match.end()
+		if start > last_unmatched_pos:
+			word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
+		word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
+		last_unmatched_pos = end
+	if remainder := line[last_unmatched_pos:]:
+		word_tokens += parse_jp_text(remainder)
+	# We now have a series of word tokens that have had all the manual furigana handled
+
+	output = {}
+
+	# Process the tokens
+	output['word_pairs'] = []
+	for token in (it := iter(word_tokens)):
+		word, katakana = token
+		if word[-1] == 'っ':  # MeCab splits ？って into ？っ, て so we merge it back
+			try:
+				next_word, next_katakana = next(it)
+				word += next_word
+				katakana += next_katakana
+			except StopIteration:
+				pass
+		hiragana = katakana.translate(katakana_to_hiragana_dict)
+		if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
+			hiragana = word_overrides[word]
+		output['word_pairs'].append((word, hiragana))
+
+	# Process our (kanji, hiragana) word pairs into furigana blocks
+	output['furi_blocks'] = []  # Must be iterated for timing
+	for kanji, hiragana in output['word_pairs']:
+		output['furi_blocks'] += word_to_furi_blocks(kanji, hiragana)
+	
+	# Create word-spaced romaji syllables
+	output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
+	for _, hiragana in output['word_pairs']:
+		output['romaji_syllables'] += [to_romaji(s) for syl in kana_to_syllable_list(hiragana) if (s:= syl.strip())]
+		if output['romaji_syllables'][-1] != ' ':
+			output['romaji_syllables'].append(' ')
+	return output
--- a/subtitle_generator.py
+++ b/subtitle_generator.py
@ -0,0 +1,67 @@
+# SubStation Alpha (ASS) generation
+
+# Colour values are &HAABBGGRR, &HBBGGRR, or &HAA.
+# Alpha is actually inverted, i.e. transparency - FF is transparent, 00 is opaque.
+
+# Template for everything up to and including the [Events] format line.
+# The {Name} placeholders are named str.format() fields; format_defaults supplies a value for each of them.
+ass_preamble = '''[Script Info]
+ScriptType: v4.00+
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.709
+PlayResX: {PlayResX}
+PlayResY: {PlayResY}
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,25,1
+Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,30,30,0,1
+Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
+Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+'''
+
+# Default value for every placeholder in ass_preamble.
+# generate_ass() works on a copy of this dict with the caller's overrides applied on top.
+format_defaults = {
+	'PlayResX': 1280,
+	'PlayResY': 720,
+	'TranslationSize': 48,
+	'RomajiSize': 60,
+	'KanjiSize': 72,
+	'FuriSize': 36,
+	# Colours are written without the '&H' prefix; the template adds it
+	'KaraokeColourFuture': '000019FF',  # SecondaryColour of the Kanji/Furigana/Romaji styles
+	'KaraokeColourPast': 'E02A0A00',  # PrimaryColour of the Kanji/Furigana/Romaji styles
+}
+
+from format import LyricTrack
+def generate_ass(filename: str, lyric_track: LyricTrack, format_overloads: dict = None):
+	format_dict = format_defaults.copy()
+	if format_overloads:
+		format_dict.update(format_overloads)
+	preamble = ass_preamble.format(format_dict)
+
+	# Kanji Furigana layout stuff
+	size_kanji = format_dict['KanjiSize']
+	size_furi = format_dict['FuriSize']
+
+	with open(filename, 'w') as file:
+		file.write(preamble)
+		# for line in lines:
+		# 	#
+		# 	for syllable in line:
+		# 		t, kanji, furi, romaji = syllable
+
+example_layout = '''
+Dialogue: 0,0:01:08.00,0:01:26.00,Kanji,,,,,,{\k0}{\K100}雨{\K100}や{\K100}雪{\K100}が{\K100}天{\K100}から{\K100}降{\K100}って{\K100}地{\K100}を{\K100}潤{\K100}し　{\K100}芽{\K100}を{\K100}出{\K100}さ{\K100}せ{\K100}る
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,,   0,1130,,,{\k0}{\K100}あめ
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,,   0, 900,,,{\k200}{\K100}ゆき
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,,   0, 700,,,{\k400}{\K100}てん
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,,   0, 370,,,{\k600}{\K100}ふ
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,,   0,   0,,,{\k800}{\K100}ち
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 260,   0,,,{\k1000}{\K100}うるお
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 570,   0,,,{\k1200}{\K100}め
+Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 800,   0,,,{\k1400}{\K100}だ
+Dialogue: 0,0:01:08.00,0:01:26.00,Translation,,,,,,Rain and snow fall from the heavens. Moisten the earth and make it sprout
+Dialogue: 0,0:01:08.00,0:01:26.00,Romaji,,,,,,{\K0}{\K100}ame {\K100}ya {\K100}yuki {\K100}ga {\K100}ten {\K100}kara {\K100}fu{\K100}tte {\K100}chi {\K100}wo {\K100}uruo{\K100}shi {\K100}me {\K100}wo {\K100}da{\K100}sa {\K100}se{\K100}ru
+'''