# KaraokeTestudaiBackend/japanese_converters.py
# (exported 2024-12-17; web-viewer chrome removed so the file parses as Python)
from format import LyricLine, FuriBlock
from fugashi import Tagger
import pykakasi
import re
kks = pykakasi.kakasi()

def to_romaji(japanese: str, separator=' ') -> str:
    """Romanize *japanese* via pykakasi, joining each chunk's Hepburn reading with *separator*."""
    readings = (chunk['hepburn'] for chunk in kks.convert(japanese))
    return separator.join(readings)
# Map the katakana block (U+30A1..U+30F6) onto hiragana (U+3041..U+3096) for str.translate.
katakana_to_hiragana_dict = {0x30A0 + offset: chr(0x3040 + offset) for offset in range(1, 87)}
# '*' is used as an explicit spacer inside kana strings; translate it to a plain space.
katakana_to_hiragana_dict[ord('*')] = ' '
# Small kana (and ん/ン) attach to the syllable before them.
kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
# Sokuon attaches to the syllable after it.
kana_merge_next_syllable = set('っッ')
def kana_to_syllable_string(kana: str) -> str:
    """Group a kana string into '|'-separated syllables (the result keeps a leading '|').

    Small kana (ゃ, ぁ, ん, ...) are appended to the previous syllable; a sokuon
    (っ/ッ) is fused onto the kana that follows it.
    """
    syl = ''
    it = iter(kana)
    for k in it:
        if k in kana_merge_next_syllable:
            # Fuse っ with the next kana. A trailing っ has nothing to fuse with;
            # default to '' instead of letting next(it) raise StopIteration out of
            # the loop (bug fix: the old code crashed on a sokuon-final string).
            k += next(it, '')
            syl += '|' + k
            continue
        # Else: start a new syllable unless this kana merges backwards.
        if k not in kana_merge_previous_syllable:
            syl += '|'
        syl += k
    return syl
def kana_to_syllable_list(kana: str) -> list[str]:
    """Split a kana string into a list of syllables."""
    # Building the '|'-delimited string first and splitting it is slightly
    # wasteful, but keeps all the grouping logic in one place.
    delimited = kana_to_syllable_string(kana)
    return delimited[1:].split('|')
tagger = Tagger('-Owakati')

def parse_jp_text(text: str) -> list[tuple[str, str]]:
    """Tokenize Japanese plaintext with MeCab, returning (surface, katakana reading) pairs."""
    pairs = []
    for token in tagger(text):
        pairs.append((str(token), token.feature.kana))
    return pairs
# Our custom word overrides have two levels:
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
# NOTE(review): the key below is an empty string, which the tokenizer never emits,
# so this override can never fire. The reading 'しゅ' suggests the key was a kanji
# (e.g. 主) that was lost in transit — TODO confirm and restore the intended key.
word_overrides = {'': 'しゅ'}
# Matches one manual-furigana span like {漢字|かんじ}; groups() = (kanji, furi).
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
    """Parse a line containing {kanji|furi} spans into a list of FuriBlock.

    Text outside the braces becomes blocks whose furigana is the empty string.
    """
    blocks: list[FuriBlock] = []
    cursor = 0
    for match in re_manual_furi.finditer(line):
        # Plain text between the previous match and this one.
        if match.start() > cursor:
            blocks.append(FuriBlock(line[cursor:match.start()], ''))
        blocks.append(FuriBlock(*match.groups()))
        cursor = match.end()
    tail = line[cursor:]
    if tail:
        blocks.append(FuriBlock(tail, ''))
    return blocks
# For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
    """Rebuild the (kanji, kana) string pair that a list of FuriBlock represents."""
    kanji_parts = []
    kana_parts = []
    for block in blocks:
        kanji_parts.append(block.kanji)
        # Blocks without furigana read as their surface text.
        kana_parts.append(block.furi or block.kanji)
    return ''.join(kanji_parts), ''.join(kana_parts)
def debug_parse_manual_furi_line(line: str):
    """Print every intermediate stage of parsing a manual-furigana line (debug aid)."""
    blocks = manual_furi_string_to_blocks(line)
    kanji, kana = furi_blocks_reconstruction(blocks)
    print(kanji)
    print(kana)
    syllable_string = kana_to_syllable_string(kana)
    print(syllable_string)
    romaji = '|'.join(to_romaji(syllable) for syllable in syllable_string.split('|'))
    print(romaji)
re_hiragana = re.compile(r'[\u3041-\u309f]+')
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split one (kanji surface, hiragana reading) word into FuriBlock runs.

    Okurigana (hiragana embedded in the surface form) anchor a regex that carves
    the reading up between the kanji runs, e.g. '打ち合わせ' -> r'(.+)ち(.+)わせ';
    the captured groups become the furigana for the kanji runs.
    On longer words this may incorrectly split (greedy groups are ambiguous with
    multiple kanji runs) and may need manual furi, which our pipeline is not set
    up for.
    """
    furi_blocks = []          # mixed: str placeholders for kanji runs, FuriBlock for okurigana
    kanji_block_indices = []  # positions in furi_blocks still holding a plain str
    last_unmatched_pos = 0
    furi_regex_pattern = ''
    for match in re_hiragana.finditer(kanji):
        start = match.start()
        if start > last_unmatched_pos:
            # Kanji run before this okurigana: capture its reading with a group.
            furi_regex_pattern += '(.+)'
            kanji_block_indices.append(len(furi_blocks))
            furi_blocks.append(kanji[last_unmatched_pos:start])  # str, not FuriBlock yet!
        okurigana = match.group(0)
        furi_regex_pattern += okurigana
        last_unmatched_pos = match.end()
        furi_blocks.append(FuriBlock(okurigana, ''))
    if remainder := kanji[last_unmatched_pos:]:
        furi_regex_pattern += '(.+)'
        kanji_block_indices.append(len(furi_blocks))
        furi_blocks.append(remainder)  # str, not FuriBlock yet!
    reading_match = re.match(furi_regex_pattern, hiragana)  # greedy groups: could be ambiguous!
    if reading_match is None:
        # Irregular reading / override that doesn't line up with the okurigana.
        # Fall back to a single block for the whole word instead of crashing
        # (bug fix: the old code raised AttributeError on .groups() of None).
        return [FuriBlock(kanji, hiragana)]
    for idx, furi in zip(kanji_block_indices, reading_match.groups()):
        furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi)  # patch str -> FuriBlock
    return furi_blocks
def parse_japanese_line(line: str):
    """Parse one lyric line into word pairs, furigana blocks and romaji syllables.

    Returns a dict with:
      'word_pairs'       -- list of (surface, hiragana) per word
      'furi_blocks'      -- flat list of FuriBlock (must be iterated for timing)
      'romaji_syllables' -- romaji per syllable with ' ' word separators mixed in
    """
    # Split the line into plaintext segments (tokenized with MeCab) and manual
    # {kanji|furi} segments (taken verbatim as (kanji, hiragana) pairs).
    last_unmatched_pos = 0
    word_tokens = []
    for match in re_manual_furi.finditer(line):
        start, end = match.start(), match.end()
        if start > last_unmatched_pos:
            word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
        word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        word_tokens += parse_jp_text(remainder)

    # We now have a series of word tokens that have had all the manual furigana handled
    output = {}
    output['word_pairs'] = []
    for token in (it := iter(word_tokens)):
        word, katakana = token
        # MeCab splits e.g. 言って into 言っ + て, so we merge it back.
        # (Bug fix: the comparison was `word[-1] == ''`, which is always false
        # for a one-character string, so the merge never ran.)
        if word.endswith('っ'):
            try:
                next_word, next_katakana = next(it)
                word += next_word
                katakana += next_katakana
            except StopIteration:
                pass
        # NOTE(review): token.feature.kana may be None for out-of-dictionary
        # words, which would make .translate raise — confirm upstream guarantees.
        hiragana = katakana.translate(katakana_to_hiragana_dict)
        if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
            hiragana = word_overrides[word]
        output['word_pairs'].append((word, hiragana))

    # Process our (kanji, hiragana) word pairs into furigana blocks
    output['furi_blocks'] = []  # Must be iterated for timing
    for kanji, hiragana in output['word_pairs']:
        output['furi_blocks'] += word_to_furi_blocks(kanji, hiragana)

    # Create word-spaced romaji syllables
    output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    for _, hiragana in output['word_pairs']:
        output['romaji_syllables'] += [to_romaji(s) for syl in kana_to_syllable_list(hiragana) if (s := syl.strip())]
        # Separate words with a single space. (Bug fix: guard the empty case so a
        # word that yields no syllables can't trigger IndexError on [-1].)
        if output['romaji_syllables'] and output['romaji_syllables'][-1] != ' ':
            output['romaji_syllables'].append(' ')
    return output