from format import LyricLine, FuriBlock
from fugashi import Tagger
import pykakasi
import re
# pykakasi converter, used for kana -> romaji (Hepburn) conversion
kks = pykakasi.kakasi()
def to_romaji(japanese: str, separator=' ') -> str:
    return separator.join([word['hepburn'] for word in kks.convert(japanese)])
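# Usage sketch (hedged): kks.convert() returns a list of segment dicts carrying a
# 'hepburn' key, so for pure-kana input this should give plain Hepburn romaji, e.g.
#   to_romaji('にほんご')  -> expected 'nihongo'
# Exact segmentation (and therefore where the separator lands) depends on the
# installed pykakasi version and its dictionary.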
# Map each katakana codepoint (U+30A1..U+30F6) onto its hiragana counterpart via the
# fixed offset between the two Unicode blocks; '*' is mapped to a space.
katakana_to_hiragana_dict = {0x30A0+i: chr(0x3040+i) for i in range(1, 87)}
katakana_to_hiragana_dict[ord('*')] = ' '
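# Sketch: str.translate() with this table turns katakana into hiragana and leaves
# unmapped characters (kanji, ASCII, the long-vowel mark ー) untouched, e.g.
#   'カラオケ'.translate(katakana_to_hiragana_dict)  -> 'からおけ'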
# Kana that attach to the preceding syllable (small vowels/y-glides plus ん)
kana_merge_previous_syllable = {k for k in 'ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン'}
# Sokuon (っ/ッ) attaches to the following syllable
kana_merge_next_syllable = {k for k in 'っッ'}
def kana_to_syllable_string(kana: str) -> str:
    # Join kana into syllables; each syllable is prefixed with a '|' separator
    syl = ''
    for k in (it := iter(kana)):
        if k in kana_merge_next_syllable:
            # Sokuon: glue the following kana onto it (the '' default guards a trailing っ)
            k += next(it, '')
            syl += '|' + k
            continue
        # Else
        if k not in kana_merge_previous_syllable:
            syl += '|'
        syl += k
    return syl
def kana_to_syllable_list(kana: str) -> list[str]:
    # Kinda inefficient to work it as a string and then split it afterwards, but elegant
    return kana_to_syllable_string(kana)[1:].split('|')
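# Sketch of the splitting rules above (deterministic for plain kana input):
#   kana_to_syllable_list('きょうは')  -> ['きょ', 'う', 'は']
#   kana_to_syllable_list('ちょっと')  -> ['ちょ', 'っと']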
# MeCab tagger via fugashi ('-Owakati' = word-segmentation output mode)
tagger = Tagger('-Owakati')

re_wa = re.compile(r'ワ')
def _parse_jp_token(token) -> tuple[str, str]:
    s = str(token)
    pronunciation: str = token.feature.pron  # katakana with ー for long vowels, might not be 1:1 for hiragana/romaji conversion
    reading: str = token.feature.kana  # katakana with correct vowels
    if pronunciation is None or reading is None:
        # Token with no dictionary reading (nonsense input etc.); downstream treats None as empty
        return (s, reading)
    # This is a hack to deal with は=わ morae: where the pronunciation has ワ but the
    # reading has ハ, prefer the pronounced ワ.
    if len(pronunciation) == len(reading):
        for match in re_wa.finditer(pronunciation):
            i = match.start()
            if reading[i] == 'ハ':
                print('Found ハ=ワ:', s, pronunciation, reading)
                reading = reading[:i] + 'ワ' + reading[i+1:]
    else:
        print('Different lengths for pronunciation and reading:', s, pronunciation, reading)
    return (s, reading)
def parse_jp_text(text: str) -> list[tuple[str, str]]:
    # Tokenize with MeCab and return (surface form, katakana reading) pairs
    return [_parse_jp_token(token) for token in tagger(text)]
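# Usage sketch (hedged): the exact segmentation and readings depend on the MeCab
# dictionary fugashi was installed with (e.g. unidic-lite), but the shape is
#   parse_jp_text('日本語を話す')  -> roughly [('日本語', 'ニホンゴ'), ('を', 'ヲ'), ('話す', 'ハナス')]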
# Our custom word overrides have two levels:
# - One is a simple search-and-replace that turns matches into the manual furigana
#   "{kanji|furi}" format. This could have false hits on short words.
# - The other overrides a word's kana post-tokenization, which requires it to be a
#   dictionary word, and is aimed at words with multiple readings.
# In the dict below, a tuple ('わたくし', 'わたし') means "replace the reading わたくし
# with わたし"; a plain string replaces the reading unconditionally.
default_word_overrides = {'私': ('わたくし', 'わたし'), '主': 'しゅ'}
# Matches manual furigana spans written as "{kanji|furi}"
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
    # Split a line containing "{kanji|furi}" spans into FuriBlocks; text outside the
    # braces becomes blocks with empty furigana.
    last_unmatched_pos = 0
    blocks = []
    for match in re_manual_furi.finditer(line):
        start = match.start()
        end = match.end()
        if start > last_unmatched_pos:
            blocks.append(FuriBlock(line[last_unmatched_pos:start], ''))
        blocks.append(FuriBlock(*match.groups()))
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        blocks.append(FuriBlock(remainder, ''))
    return blocks
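# Sketch (follows directly from re_manual_furi; assumes the FuriBlock(kanji, furi)
# field order used throughout this module):
#   manual_furi_string_to_blocks('{夜|よる}に駆ける')
#     -> [FuriBlock('夜', 'よる'), FuriBlock('に駆ける', '')]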
# For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
    # Rebuild the full kanji string and the full kana string from a block list
    kanji = ''.join([b.kanji for b in blocks])
    kana = ''.join([b.furi if b.furi else b.kanji for b in blocks])
    return kanji, kana
def debug_parse_manual_furi_line(line: str):
    blocks = manual_furi_string_to_blocks(line)
    kanji, kana = furi_blocks_reconstruction(blocks)
    print(kanji)
    print(kana)
    syllables = kana_to_syllable_string(kana)
    print(syllables)
    romaji_syllables = '|'.join([to_romaji(syl) for syl in syllables.split('|')])
    print(romaji_syllables)
re_hiragana = re.compile(r'[\u3041-\u309f]+')  # hiragana block
re_kana = re.compile(r'[\u3041-\u30ff]+')  # hiragana + katakana
re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')  # kana, excluding は (U+306F)
def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
    # On longer words this may split incorrectly and need manual furi, which our pipeline is not set up for.
    # The problem is matching the okurigana against the hiragana reading;
    # in words with multiple kanji separated by okurigana this is a hard problem.

    # Naive approach: pick out all the kana runs and build a regex from them,
    # e.g. turn '打ち合わせ' into r'(.+)ち(.+)わせ'
    furi_blocks = []
    kanji_block_indices = []
    last_unmatched_pos = 0
    furi_regex_pattern = ''
    for match in regex.finditer(kanji):
        start = match.start()
        if start > last_unmatched_pos:
            furi_regex_pattern += '(.+)'
            kanji_block_indices.append(len(furi_blocks))
            furi_blocks.append(kanji[last_unmatched_pos:start])  # str not FuriBlock!
        furi = match.group(0)
        furi_regex_pattern += furi.translate(katakana_to_hiragana_dict)
        last_unmatched_pos = match.end()
        furi_blocks.append(FuriBlock(furi, ''))
    if remainder := kanji[last_unmatched_pos:]:
        furi_regex_pattern += '(.+)'
        kanji_block_indices.append(len(furi_blocks))
        furi_blocks.append(remainder)  # str not FuriBlock!
    # This will throw on mismatch, e.g. from は=わ
    try:
        furi_groups = re.match(furi_regex_pattern, hiragana).groups()  # This could be ambiguous!
    except AttributeError:
        # re.match() returned None: the reading did not fit the pattern
        # print(furi_regex_pattern)
        # print(kanji)
        # print(hiragana)
        raise
    for idx, furi in zip(kanji_block_indices, furi_groups):
        furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip())  # str -> FuriBlock
    return furi_blocks
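# Sketch of the naive split on the example above. The groups are greedy, so the
# result can be ambiguous when the reading repeats the okurigana kana:
#   _word_to_furi_blocks('打ち合わせ', 'うちあわせ', re_kana)
#     -> [FuriBlock('打', 'う'), FuriBlock('ち', ''), FuriBlock('合', 'あ'), FuriBlock('わせ', '')]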
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    # Workaround for は=わ: if the first parse fails, retry with は treated like a kanji
    try:
        return _word_to_furi_blocks(kanji, hiragana, re_kana)
    except AttributeError:
        return _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
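# Sketch of the retry path: when the reading writes わ for a written は (after the
# は=わ hack in _parse_jp_token), the plain kana pattern cannot match, so the second
# attempt treats は like a kanji block:
#   word_to_furi_blocks('こんにちは', 'こんにちわ')
#     -> [FuriBlock('こんにち', ''), FuriBlock('は', 'わ')]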
def parse_overrides_str(overrides_str: str) -> dict:
    # Convert a line like '私|わたくし|わたし' to {'私': ('わたくし', 'わたし')}
    # and a line like '私|わたし' to {'私': 'わたし'}
    overrides_dict = default_word_overrides.copy()
    for line in overrides_str.splitlines():
        line = line.strip()
        if not line:
            continue
        kanji, *replacement = line.split('|')
        if not kanji or not replacement:
            continue
        if len(replacement) > 1:
            overrides_dict[kanji] = (replacement[0], replacement[1])
        else:
            overrides_dict[kanji] = replacement[0]
    return overrides_dict
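# Sketch: the built-in defaults are kept and user lines extend or override them, e.g.
#   parse_overrides_str('君|きみ')
#     -> {'私': ('わたくし', 'わたし'), '主': 'しゅ', '君': 'きみ'}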
def parse_japanese_line(line: str, overrides_str: str=''):
    word_overrides = parse_overrides_str(overrides_str)

    # Split line into plaintext segments to be tokenized, and manual furigana segments
    last_unmatched_pos = 0
    word_tokens = []
    for match in re_manual_furi.finditer(line):
        start = match.start()
        end = match.end()
        if start > last_unmatched_pos:
            word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process Japanese plaintext
        word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        word_tokens += parse_jp_text(remainder)
    # We now have a series of word tokens with all the manual furigana already handled

    output = {}

    # Process the tokens
    output['word_pairs'] = []
    for token in (it := iter(word_tokens)):
        word, katakana = token
        if katakana is None:  # TODO: Weird edge case with nonsense input, should fix elsewhere
            katakana = ''
        if word[-1] == 'っ':  # MeCab splits ?って into ?っ, て so we merge it back
            try:
                next_word, next_katakana = next(it)
                word += next_word
                katakana += next_katakana
            except StopIteration:
                pass
        hiragana = katakana.translate(katakana_to_hiragana_dict)
        if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
            override = word_overrides[word]
            if isinstance(override, tuple):
                # Conditional override: only replace the reading it was parsed as
                if hiragana == override[0]:
                    hiragana = override[1]
            else:
                # Unconditional override
                hiragana = override
        output['word_pairs'].append((word, hiragana))

    # Process our (kanji, hiragana) word pairs into furigana blocks
    output['furi_blocks'] = []  # Must be iterated for timing
    for kanji, hiragana in output['word_pairs']:
        output['furi_blocks'] += word_to_furi_blocks(kanji, hiragana)

    # Create word-spaced hiragana and romaji syllables
    output['hiragana_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    for _, hiragana in output['word_pairs']:
        syllables = [s for syl in kana_to_syllable_list(hiragana) if (s := syl.strip())]
        output['hiragana_syllables'] += syllables
        output['romaji_syllables'] += [to_romaji(syl) for syl in syllables]
        if len(syllables) > 0:
            output['hiragana_syllables'].append(' ')
            output['romaji_syllables'].append(' ')
    if len(output['romaji_syllables']) > 0:  # Remove trailing space
        output['hiragana_syllables'].pop()
        output['romaji_syllables'].pop()

    return output
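# Minimal usage sketch (hedged): the exact tokenization, readings and romaji depend on
# the installed MeCab dictionary and pykakasi version, but the output dict always has
# the keys 'word_pairs', 'furi_blocks', 'hiragana_syllables' and 'romaji_syllables'.
if __name__ == '__main__':
    result = parse_japanese_line('{夜|よる}に駆ける')
    print(result['word_pairs'])          # e.g. [('夜', 'よる'), ('に', 'に'), ('駆ける', 'かける')]
    print(result['hiragana_syllables'])  # syllables with ' ' word separators mixed in
    print(result['romaji_syllables'])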