KaraokeTestudaiBackend/japanese_converters.py

151 lines
5.7 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from format import LyricLine, FuriBlock
from fugashi import Tagger
import pykakasi
import re
# Shared kakasi converter; built once at import time since construction is not free.
kks = pykakasi.kakasi()


def to_romaji(japanese: str, separator=' ') -> str:
    """Convert Japanese text to Hepburn romaji, joining converted chunks with *separator*."""
    chunks = kks.convert(japanese)
    return separator.join(chunk['hepburn'] for chunk in chunks)
# str.translate table mapping each katakana code point (U+30A1..U+30F6) to its
# hiragana counterpart, which sits exactly 0x60 lower in the Unicode table.
katakana_to_hiragana_dict = {0x30A0 + i: chr(0x3040 + i) for i in range(1, 87)}
# '*' placeholders in readings are rendered as a plain space.
katakana_to_hiragana_dict[ord('*')] = ' '
# Small kana (and ん/ン) that attach to the syllable BEFORE them, in both scripts.
kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
# Sokuon (small tsu) attaches to the syllable AFTER it.
kana_merge_next_syllable = set('っッ')
def kana_to_syllable_string(kana: str) -> str:
    """Split a kana string into '|'-delimited syllables (leading '|' included).

    Small kana (ゃ, ぁ, ん, ...) merge into the preceding syllable; sokuon
    (っ/ッ) merges into the following one, e.g. 'きって' -> '|き|って'.
    """
    syl = ''
    it = iter(kana)
    for k in it:
        if k in kana_merge_next_syllable:
            # Sokuon: glue it onto the following character. next(it, '') keeps
            # a trailing っ/ッ from raising StopIteration mid-loop (bug fix:
            # the original bare next(it) crashed on inputs ending in sokuon).
            k += next(it, '')
            syl += '|' + k
            continue
        # A new syllable starts here unless this is a trailing small kana.
        if k not in kana_merge_previous_syllable:
            syl += '|'
        syl += k
    return syl
def kana_to_syllable_list(kana: str) -> list[str]:
    """Return the syllables of *kana* as a list (see kana_to_syllable_string)."""
    # Build the delimited string, drop its leading '|', then split on the rest.
    delimited = kana_to_syllable_string(kana)
    return delimited.removeprefix('|').split('|')
# MeCab tokenizer via fugashi; '-Owakati' requests word-segmentation output.
tagger = Tagger('-Owakati')
# Tokenize *text*, returning (surface_form, katakana_reading) pairs.
# NOTE(review): token.feature.kana can be None for out-of-vocabulary tokens —
# confirm downstream callers tolerate a None reading.
def parse_jp_text(text: str) -> list[tuple[str, str]]:
    return [(str(token), token.feature.kana) for token in tagger(text)]
# Our custom word overrides have two levels:
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
# NOTE(review): the dictionary key below renders as an empty string here. The
# file is known to contain invisible Unicode characters, so the intended kanji
# key (reading 'しゅ') may have been lost in transit — verify against version
# control before relying on this override.
word_overrides = {'': 'しゅ'}
# Matches manual furigana spans of the form "{kanji|furi}" (non-greedy on both sides).
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
    """Parse a line containing "{kanji|furi}" spans into a list of FuriBlock.

    Plain text between spans becomes FuriBlock(text, '') entries; each
    "{kanji|furi}" span becomes FuriBlock(kanji, furi).
    """
    blocks = []
    cursor = 0
    for m in re_manual_furi.finditer(line):
        gap = line[cursor:m.start()]
        if gap:
            blocks.append(FuriBlock(gap, ''))
        kanji, furi = m.groups()
        blocks.append(FuriBlock(kanji, furi))
        cursor = m.end()
    tail = line[cursor:]
    if tail:
        blocks.append(FuriBlock(tail, ''))
    return blocks
# For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
    """Rebuild the (kanji_string, kana_string) pair from a furi block list.

    Blocks without furigana contribute their surface text to the kana string.
    """
    kanji_parts = []
    kana_parts = []
    for block in blocks:
        kanji_parts.append(block.kanji)
        kana_parts.append(block.furi or block.kanji)
    return ''.join(kanji_parts), ''.join(kana_parts)
def debug_parse_manual_furi_line(line: str):
    """Print every intermediate stage of parsing a manual-furigana line."""
    blocks = manual_furi_string_to_blocks(line)
    kanji, kana = furi_blocks_reconstruction(blocks)
    print(kanji)
    print(kana)
    syllables = kana_to_syllable_string(kana)
    print(syllables)
    romaji = '|'.join(to_romaji(syl) for syl in syllables.split('|'))
    print(romaji)
# A run of hiragana only (U+3041..U+309F).
re_hiragana = re.compile(r'[\u3041-\u309f]+')
# A run of any kana: hiragana plus katakana (U+3041..U+30FF).
re_kana = re.compile(r'[\u3041-\u30ff]+')
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split a (surface, reading) word pair into FuriBlock segments.

    Kana runs inside *kanji* become furigana-less blocks; each kanji run gets
    the slice of *hiragana* that a back-reference regex assigns to it.
    Naive approach: pick out all the kana and make a regex,
    e.g. turn '打ち合わせ' into r'(.+)ち(.+)わせ' and match it against the
    reading to recover each kanji run's furigana. On longer words with
    repeated okurigana this split can be ambiguous/wrong and would need
    manual furigana, which this pipeline is not set up for.
    """
    furi_blocks = []          # mixed: str placeholders for kanji runs, FuriBlock for kana runs
    kanji_block_indices = []  # positions in furi_blocks still holding str placeholders
    last_unmatched_pos = 0
    furi_regex_pattern = ''
    for match in re_kana.finditer(kanji):
        start = match.start()
        if start > last_unmatched_pos:
            # Kanji run before this kana run: capture its reading with a group.
            furi_regex_pattern += '(.+)'
            kanji_block_indices.append(len(furi_blocks))
            furi_blocks.append(kanji[last_unmatched_pos:start])  # str placeholder, resolved below
        kana_run = match.group(0)
        # Normalize katakana to hiragana so the pattern can match the reading.
        furi_regex_pattern += kana_run.translate(katakana_to_hiragana_dict)
        last_unmatched_pos = match.end()
        furi_blocks.append(FuriBlock(kana_run, ''))
    if remainder := kanji[last_unmatched_pos:]:
        furi_regex_pattern += '(.+)'
        kanji_block_indices.append(len(furi_blocks))
        furi_blocks.append(remainder)  # str placeholder, resolved below
    match = re.match(furi_regex_pattern, hiragana)  # This could be ambiguous!
    if match is None:
        # The reading does not line up with the surface form (e.g. irregular
        # or overridden reading). Fall back to one block covering the whole
        # word instead of crashing with AttributeError on .groups() (bug fix).
        return [FuriBlock(kanji, hiragana)]
    for idx, furi in zip(kanji_block_indices, match.groups()):
        # strip(): '*' reading placeholders were translated to spaces above.
        furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip())  # str -> FuriBlock
    return furi_blocks
def parse_japanese_line(line: str):
    """Parse a lyric line (plaintext mixed with "{kanji|furi}" spans) into a dict
    with keys 'word_pairs', 'furi_blocks' and 'romaji_syllables'.
    """
    # Split line into plaintext segments to be tokenized, and manual furigana segments
    last_unmatched_pos = 0
    word_tokens = []
    for match in re_manual_furi.finditer(line):
        start = match.start()
        end = match.end()
        if start > last_unmatched_pos:
            word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
        word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        word_tokens += parse_jp_text(remainder)
    # We now have a series of word tokens that have had all the manual furigana handled
    output = {}
    # Process the tokens
    output['word_pairs'] = []
    for token in (it := iter(word_tokens)):
        word, katakana = token
        # NOTE(review): the comparison literal below renders as an empty string —
        # the file contains invisible Unicode characters, and per the inline
        # comment this was presumably a small-tsu ('っ') check that may have been
        # lost. As displayed, a single character can never equal '', so the merge
        # branch would never fire. Also assumes word is non-empty (word[-1]).
        if word[-1] == '':  # MeCab splits ?って into ?っ, て so we merge it back
            try:
                next_word, next_katakana = next(it)
                word += next_word
                katakana += next_katakana
            except StopIteration:
                pass
        # NOTE(review): katakana may be None for OOV tokens (parse_jp_text returns
        # token.feature.kana directly) — .translate would raise here; confirm.
        hiragana = katakana.translate(katakana_to_hiragana_dict)
        if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
            hiragana = word_overrides[word]
        output['word_pairs'].append((word, hiragana))
    # Process our (kanji, hiragana) word pairs into furigana blocks
    output['furi_blocks'] = []  # Must be iterated for timing
    for kanji, hiragana in output['word_pairs']:
        output['furi_blocks'] += word_to_furi_blocks(kanji, hiragana)
    # Create word-spaced romaji syllables
    output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    for _, hiragana in output['word_pairs']:
        output['romaji_syllables'] += [to_romaji(s) for syl in kana_to_syllable_list(hiragana) if (s:= syl.strip())]
        # NOTE(review): if a word yields no syllables while the list is still
        # empty, this [-1] raises IndexError — confirm inputs always produce
        # at least one syllable before this point.
        if output['romaji_syllables'][-1] != ' ':
            output['romaji_syllables'].append(' ')
    return output