Initial commit
This commit is contained in:
commit
4888f1c99f
|
@ -0,0 +1,14 @@
|
|||
[[source]]
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
fugashi = "*"
|
||||
unidic = "*"
|
||||
pykakasi = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
python_version = "3.12"
|
|
@ -0,0 +1,16 @@
|
|||
from collections import namedtuple
|
||||
# A furigana-annotated line is held as a sequence of (kanji, furi) pairs; a run
# with no reading carries an empty furi. For example
# '{与|あた}えた{使命|しめい} ' is stored as
# [('与','あた'), ('えた',''), ('使命','しめい'), (' ','')]
FuriBlock = namedtuple('FuriBlock', 'kanji furi')
|
||||
|
||||
class LyricLine:
    """One line of lyrics: its beat timing plus its text layers.

    Only ``beat_stamps`` is initialised on construction; the other
    attributes are declared here and assigned by the caller.
    """

    # Beat positions that start at zero for each line; real timing comes from
    # get_timestamps().
    beat_stamps: list[float]
    translated_line: str
    # May contain space entries, which are skipped over when calculating timing.
    romaji_syllables: list[str]
    # Quoted so the name is resolved by type checkers only, not while the class
    # body executes.
    furi_blocks: "list[FuriBlock]"

    def __init__(self) -> None:
        # Created per instance: a class-level ``[]`` default would be a single
        # list shared (and mutated) by every LyricLine.
        self.beat_stamps = []

    def get_timestamps(self, bpm: float, start_offset: float) -> list[float]:
        """Convert this line's beat stamps into timestamps in seconds.

        Args:
            bpm: Tempo in beats per minute.
            start_offset: Seconds added to every converted beat.

        Returns:
            One timestamp per entry of ``beat_stamps``, in the same order.

        Raises:
            ZeroDivisionError: If *bpm* is zero.
        """
        seconds_per_beat = 60.0 / bpm
        return [(seconds_per_beat * beat) + start_offset for beat in self.beat_stamps]
|
||||
|
||||
class LyricTrack:
    """Container for the LyricLine entries of one track."""

    # Declared only: nothing in this class assigns it, so the caller must.
    lines: list[LyricLine]
|
|
@ -0,0 +1,149 @@
|
|||
from format import LyricLine, FuriBlock
|
||||
from fugashi import Tagger
|
||||
import pykakasi
|
||||
import re
|
||||
|
||||
# Single pykakasi converter shared by every to_romaji() call.
kks = pykakasi.kakasi()


def to_romaji(japanese: str, separator=' ') -> str:
    """Return the Hepburn romanisation of *japanese*.

    The text is passed through the module-level pykakasi converter and the
    'hepburn' entry of each item it returns is joined with *separator*.
    """
    converted_items = kks.convert(japanese)
    return separator.join(item['hepburn'] for item in converted_items)
|
||||
|
||||
# Table for str.translate(): every code point U+30A1..U+30F6 maps to the
# character 0x60 below it (katakana -> hiragana), and '*' maps to one space.
katakana_to_hiragana_dict = {
    code_point: chr(code_point - 0x60) for code_point in range(0x30A1, 0x30F7)
}
katakana_to_hiragana_dict.update({ord('*'): ' '})
|
||||
|
||||
# Kana that never start a syllable of their own: they are appended to the
# syllable before them.
kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
# Kana that are grouped with the character that follows them.
kana_merge_next_syllable = set('っッ')


def kana_to_syllable_string(kana: str) -> str:
    """Return *kana* with a '|' inserted in front of every syllable.

    A character from ``kana_merge_previous_syllable`` is appended to the
    current syllable without a separator; a character from
    ``kana_merge_next_syllable`` opens a new syllable and takes the next
    character with it. For example 'きって' becomes '|き|って'.

    The result has no leading '|' when *kana* starts with a merge-previous
    character, and is '' for empty input.
    """
    pieces = []
    chars = iter(kana)
    for char in chars:
        if char in kana_merge_next_syllable:
            # A sokuon at the very end has nothing to merge with; the default
            # keeps next() from raising StopIteration out of this function.
            pieces.append('|' + char + next(chars, ''))
        elif char in kana_merge_previous_syllable:
            pieces.append(char)
        else:
            pieces.append('|' + char)
    return ''.join(pieces)
|
||||
|
||||
def kana_to_syllable_list(kana: str) -> list[str]:
    """Split *kana* into a list of syllables.

    The work is delegated to kana_to_syllable_string(), whose '|' separated
    result is split here. An empty *kana* yields [''] (the result of splitting
    an empty string).
    """
    joined = kana_to_syllable_string(kana)
    # The separator string only starts with '|' when the first character opens
    # a syllable; slicing unconditionally would drop a leading kana such as a
    # lone 'ん'.
    if joined.startswith('|'):
        joined = joined[1:]
    return joined.split('|')
|
||||
|
||||
# Single fugashi tagger shared by every parse_jp_text() call.
tagger = Tagger('-Owakati')


def parse_jp_text(text: str) -> list[tuple[str, str]]:
    """Tokenize *text* and return one (surface, kana reading) pair per token.

    The surface is ``str(token)`` and the reading is ``token.feature.kana``.
    When the tagger reports no reading, the surface itself is used so that
    both members of every pair are strings.
    """
    pairs = []
    for token in tagger(text):
        surface = str(token)
        reading = token.feature.kana
        # NOTE(review): fugashi appears to fill missing features with None for
        # tokens outside the dictionary - confirm. Callers call str methods on
        # the reading, so a None here would crash them.
        if reading is None:
            reading = surface
        pairs.append((surface, reading))
    return pairs
|
||||
|
||||
# Our custom word overrides have two levels:
|
||||
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
|
||||
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
|
||||
word_overrides = {'主': 'しゅ'}
|
||||
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
|
||||
|
||||
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
    """Convert a line with "{kanji|furi}" markup into a list of FuriBlocks.

    Each markup group becomes FuriBlock(kanji, furi). Any text between, before
    or after the groups becomes a FuriBlock with an empty furi. Blocks are
    returned in the order they occur in *line*.
    """
    result = []
    cursor = 0
    for found in re_manual_furi.finditer(line):
        group_start, group_end = found.span()
        plain_text = line[cursor:group_start]
        if plain_text:
            result.append(FuriBlock(plain_text, ''))
        base, reading = found.groups()
        result.append(FuriBlock(base, reading))
        cursor = group_end
    trailing_text = line[cursor:]
    if trailing_text:
        result.append(FuriBlock(trailing_text, ''))
    return result
|
||||
|
||||
# Debug helper
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
    """Rebuild the plain text and its reading from *blocks*.

    Returns a ``(kanji, kana)`` tuple: ``kanji`` concatenates every block's
    kanji field; ``kana`` concatenates each block's furi, or its kanji field
    when the furi is empty.
    """
    surface = ''.join(block.kanji for block in blocks)
    reading = ''.join(block.furi or block.kanji for block in blocks)
    return surface, reading
|
||||
|
||||
def debug_parse_manual_furi_line(line: str):
    """Print each processing stage of a manual-furigana line.

    Four lines are printed in order: the reconstructed text, its kana
    reading, the '|' separated syllable string, and the romaji of each piece
    of that string joined with '|'.
    """
    surface, reading = furi_blocks_reconstruction(manual_furi_string_to_blocks(line))
    print(surface, reading, sep='\n')
    syllable_string = kana_to_syllable_string(reading)
    print(syllable_string)
    print('|'.join(to_romaji(piece) for piece in syllable_string.split('|')))
|
||||
|
||||
# One or more consecutive characters in U+3041..U+309F (hiragana).
re_hiragana = re.compile(r'[\u3041-\u309f]+')


def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split one word into FuriBlocks by aligning its okurigana with its reading.

    Naive approach: every hiragana run in *kanji* is kept as a literal and
    every other run becomes a '(.+)' group, e.g. '打ち合わせ' becomes
    r'(.+)ち(.+)わせ'. That pattern is matched against *hiragana* and each
    captured group becomes the furi of the corresponding run.

    In words with several kanji runs separated by okurigana the match can be
    ambiguous, so longer words may be split incorrectly and need manual furi.

    Args:
        kanji: The word as written.
        hiragana: The reading of the whole word.

    Returns:
        FuriBlocks in written order. Hiragana runs carry an empty furi. If the
        pattern does not match the reading, a single block pairing the whole
        word with the whole reading is returned instead.
    """
    furi_blocks = []
    kanji_block_indices = []  # slots in furi_blocks still holding a bare str
    pattern_parts = []
    last_unmatched_pos = 0

    def add_unread_run(run):
        # Placeholder str, swapped for a FuriBlock once the reading is known.
        pattern_parts.append('(.+)')
        kanji_block_indices.append(len(furi_blocks))
        furi_blocks.append(run)

    for match in re_hiragana.finditer(kanji):
        start = match.start()
        if start > last_unmatched_pos:
            add_unread_run(kanji[last_unmatched_pos:start])
        okurigana = match.group(0)
        pattern_parts.append(okurigana)
        furi_blocks.append(FuriBlock(okurigana, ''))
        last_unmatched_pos = match.end()
    if remainder := kanji[last_unmatched_pos:]:
        add_unread_run(remainder)

    if not kanji_block_indices:
        # Nothing needs a reading, so there is nothing to align.
        return furi_blocks

    aligned = re.match(''.join(pattern_parts), hiragana)  # This could be ambiguous!
    if aligned is None:
        # The written okurigana do not occur in the reading; annotate the word
        # as a whole rather than failing on None.
        return [FuriBlock(kanji, hiragana)]
    for idx, furi in zip(kanji_block_indices, aligned.groups()):
        furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi)  # str -> FuriBlock
    return furi_blocks
|
||||
|
||||
def parse_japanese_line(line: str):
    """Parse one lyric line into word pairs, furigana blocks and romaji.

    Text inside "{kanji|furi}" markup is taken as given; everything else is
    tokenized with parse_jp_text().

    Returns:
        A dict with three keys:
        'word_pairs': list of (word, hiragana) tuples.
        'furi_blocks': the word_to_furi_blocks() results for every pair,
            concatenated.
        'romaji_syllables': romaji per syllable with a ' ' entry after each
            word that produced syllables.
    """
    # Split line into plaintext segments to be tokenized, and manual furigana segments
    word_tokens = []
    last_unmatched_pos = 0
    for match in re_manual_furi.finditer(line):
        start, end = match.span()
        if start > last_unmatched_pos:
            word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
        word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        word_tokens += parse_jp_text(remainder)
    # We now have a series of word tokens that have had all the manual furigana handled

    # Process the tokens
    word_pairs = []
    token_iter = iter(word_tokens)
    for word, katakana in token_iter:
        # endswith() instead of word[-1] so an empty surface cannot raise.
        if word.endswith('っ'):  # MeCab splits ?って into ?っ, て so we merge it back
            # If the line ends on the っ token there is nothing to merge.
            next_word, next_katakana = next(token_iter, ('', ''))
            word += next_word
            katakana += next_katakana
        hiragana = katakana.translate(katakana_to_hiragana_dict)
        if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
            hiragana = word_overrides[word]
        word_pairs.append((word, hiragana))

    # Process our (kanji, hiragana) word pairs into furigana blocks
    furi_blocks = []  # Must be iterated for timing
    for kanji, hiragana in word_pairs:
        furi_blocks += word_to_furi_blocks(kanji, hiragana)

    # Create word-spaced romaji syllables
    romaji_syllables = []  # Will have spaces mixed in so must be iterated for timing
    for _, hiragana in word_pairs:
        romaji_syllables += [to_romaji(s) for syl in kana_to_syllable_list(hiragana) if (s := syl.strip())]
        # The emptiness check matters when the leading word(s) produce no
        # syllables: indexing [-1] on the still-empty list would raise.
        if romaji_syllables and romaji_syllables[-1] != ' ':
            romaji_syllables.append(' ')

    return {
        'word_pairs': word_pairs,
        'furi_blocks': furi_blocks,
        'romaji_syllables': romaji_syllables,
    }
|
|
@ -0,0 +1,67 @@
|
|||
# Substation Alpha (ASS) generation
#
# Colour values are written as &HAABBGGRR, &HBBGGRR, or &HAA.
# The alpha byte is a transparency value: FF is transparent, 00 is opaque.

# Template for everything up to and including the [Events] header; the
# {placeholders} are filled by name from format_defaults plus any overrides.
ass_preamble = '''[Script Info]
ScriptType: v4.00+
WrapStyle: 0
ScaledBorderAndShadow: yes
YCbCr Matrix: TV.709
PlayResX: {PlayResX}
PlayResY: {PlayResY}

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,72,&H002A0A00,&H000019FF,&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
Style: Kanji,Migu 1P,{KanjiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,4.0,0,2,30,30,25,1
Style: Furigana,Migu 1P,{FuriSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,2,30,30,0,1
Style: Romaji,Migu 1P,{RomajiSize},&H{KaraokeColourPast},&H{KaraokeColourFuture},&H00FFFFFF,&H00000000,0,0,0,0,100,100,0,0,1,2.5,0,8,30,30,25,1
Style: Translation,Migu 1P,{TranslationSize},&H00FFFFFF,&H000019FF,&H00000000,&H00000000,0,1,0,0,100,100,0,0,1,1.0,3,8,30,30,25,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
'''

# Default value for every placeholder used in ass_preamble.
format_defaults = dict(
    PlayResX=1280,
    PlayResY=720,
    TranslationSize=48,
    RomajiSize=60,
    KanjiSize=72,
    FuriSize=36,
    KaraokeColourFuture='000019FF',
    KaraokeColourPast='E02A0A00',
)
|
||||
|
||||
from format import LyricTrack
|
||||
def generate_ass(filename: str, lyric_track: LyricTrack, format_overloads: dict | None = None):
    """Write an ASS subtitle file to *filename*.

    Only the preamble (script info, styles and the [Events] header) is written
    so far; *lyric_track* is accepted but not yet used.

    Args:
        filename: Path of the file to create or overwrite.
        lyric_track: The track whose lines will become Dialogue events.
        format_overloads: Optional values that replace entries of
            format_defaults when filling in the preamble template.
    """
    format_dict = format_defaults.copy()
    if format_overloads:
        format_dict.update(format_overloads)
    # The template uses named fields, so the values must be passed as keyword
    # arguments; passing the dict positionally raises KeyError.
    preamble = ass_preamble.format(**format_dict)

    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(preamble)
        # TODO: write the Dialogue events for every line of lyric_track, one
        # entry per syllable (time, kanji, furi, romaji). The kanji/furigana
        # layout will need format_dict['KanjiSize'] and format_dict['FuriSize'].
||||
# Sample [Events] showing the intended layout for one lyric line: a Kanji
# event, one Furigana event per annotated kanji run, a Translation event and a
# Romaji event.
# Raw string: the ASS override tags (\k, \K) are not valid Python escape
# sequences and would otherwise trigger an invalid-escape warning.
example_layout = r'''
Dialogue: 0,0:01:08.00,0:01:26.00,Kanji,,,,,,{\k0}{\K100}雨{\K100}や{\K100}雪{\K100}が{\K100}天{\K100}から{\K100}降{\K100}って{\K100}地{\K100}を{\K100}潤{\K100}し {\K100}芽{\K100}を{\K100}出{\K100}さ{\K100}せ{\K100}る
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0,1130,,,{\k0}{\K100}あめ
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 900,,,{\k200}{\K100}ゆき
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 700,,,{\k400}{\K100}てん
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 370,,,{\k600}{\K100}ふ
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 0, 0,,,{\k800}{\K100}ち
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 260, 0,,,{\k1000}{\K100}うるお
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 570, 0,,,{\k1200}{\K100}め
Dialogue: 0,0:01:08.00,0:01:26.00,Furigana,, 800, 0,,,{\k1400}{\K100}だ
Dialogue: 0,0:01:08.00,0:01:26.00,Translation,,,,,,Rain and snow fall from the heavens. Moisten the earth and make it sprout
Dialogue: 0,0:01:08.00,0:01:26.00,Romaji,,,,,,{\K0}{\K100}ame {\K100}ya {\K100}yuki {\K100}ga {\K100}ten {\K100}kara {\K100}fu{\K100}tte {\K100}chi {\K100}wo {\K100}uruo{\K100}shi {\K100}me {\K100}wo {\K100}da{\K100}sa {\K100}se{\K100}ru
'''
|
Loading…
Reference in New Issue