# KaraokeTestudaiBackend/japanese_converters.py
#
# Japanese text conversion helpers for the karaoke backend: romaji
# conversion, kana syllable splitting, MeCab tokenization, and
# furigana block generation.
#
# NOTE(review): this file contained invisible/stripped Unicode characters
# in several string literals (Japanese kana/kanji); verify each restored
# literal against the original repository.

from format import LyricLine, FuriBlock
from fugashi import Tagger
import pykakasi
import re
kks = pykakasi.kakasi()  # shared converter instance; pykakasi setup is expensive


def to_romaji(japanese: str, separator=' ') -> str:
    """Convert Japanese text to Hepburn romaji, joining kakasi tokens with *separator*."""
    chunks = kks.convert(japanese)
    return separator.join(chunk['hepburn'] for chunk in chunks)
# str.translate table mapping katakana (U+30A1..U+30F6) onto the parallel
# hiragana block (U+3041..U+3096); '*' additionally maps to a plain space.
katakana_to_hiragana_dict = {0x30A0 + i: chr(0x3040 + i) for i in range(1, 87)}
katakana_to_hiragana_dict[ord('*')] = ' '
# Small kana and ん attach to the preceding syllable (both scripts)...
kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
# ...while sokuon (small tsu) attaches to the following syllable.
kana_merge_next_syllable = set('っッ')
def kana_to_syllable_string(kana: str):
    """Return *kana* with '|' inserted before each syllable boundary.

    Sokuon (っ/ッ) is fused with the kana that follows it; small kana and
    ん are fused with the kana that precedes them. The result starts with
    a leading '|' when *kana* is non-empty.
    """
    pieces = []
    chars = iter(kana)
    for ch in chars:
        if ch in kana_merge_next_syllable:
            # Sokuon: consume the next kana too and start a new syllable.
            pieces.append('|' + ch + next(chars))
        elif ch in kana_merge_previous_syllable:
            # Small kana / ん: extend the current syllable.
            pieces.append(ch)
        else:
            pieces.append('|' + ch)
    return ''.join(pieces)
def kana_to_syllable_list(kana: str) -> list[str]:
    """Split *kana* into a list of syllables.

    Builds the '|'-delimited string first and splits it — slightly
    wasteful, but keeps the boundary logic in one place.
    """
    delimited = kana_to_syllable_string(kana)
    return delimited[1:].split('|')
# MeCab tokenizer (via fugashi); '-Owakati' selects wakati (word-split) output.
tagger = Tagger('-Owakati')
# Matches ワ in a pronunciation string, for the は=わ fixup in _parse_jp_token.
# NOTE(review): the pattern had been corrupted to r'' by invisible-character
# stripping; an empty pattern matches at every position, breaking the fixup.
re_wa = re.compile(r'ワ')
def _parse_jp_token(token) -> tuple[str, str]:
s = str(token)
pronunciation: str = token.feature.pron # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
reading: str = token.feature.kana # katakana with correct vowels
# This is a hack to deal with は=わ morae
if len(pronunciation) == len(reading):
for match in re_wa.finditer(pronunciation):
i = match.start()
if reading[i] == '':
print('Found ハ=ワ:', s, pronunciation, reading)
reading = reading[:i] + '' + reading[i+1:]
else:
print('Different lengths pronunciation and reading:', s, pronunciation, reading)
return (s, reading)
def parse_jp_text(text: str) -> list[tuple[str, str]]:
    """Tokenize *text* with MeCab and return a (surface, reading) pair per token."""
    pairs = []
    for token in tagger(text):
        pairs.append(_parse_jp_token(token))
    return pairs
# Our custom word overrides have two levels:
# - One is a simple search-replace to turn matches into manual furigana "{kanji|furi}" format. This could have false hits on short words.
# - The latter is to override a word's kana post-tokenization, which requires it to be a dictionary word with multiple readings.
# A tuple value (old, new) replaces the reading only when it equals old;
# a plain string value replaces the reading unconditionally.
# NOTE(review): the keys had been corrupted to empty strings by
# invisible-character stripping; restored to 私 (per the '私|わたくし|わたし'
# example in parse_overrides_str) and 主 — confirm 主 against the original.
default_word_overrides = {'私': ('わたくし', 'わたし'), '主': 'しゅ'}
# Matches one manual-furigana span "{kanji|furi}"; group 1 = kanji, group 2 = furi.
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')
def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
    """Parse a line containing '{kanji|furi}' markup into FuriBlocks.

    Plain-text spans between (and around) the markup become FuriBlocks
    with an empty furi string.
    """
    blocks = []
    cursor = 0
    for m in re_manual_furi.finditer(line):
        plain = line[cursor:m.start()]
        if plain:
            blocks.append(FuriBlock(plain, ''))
        kanji, furi = m.groups()
        blocks.append(FuriBlock(kanji, furi))
        cursor = m.end()
    tail = line[cursor:]
    if tail:
        blocks.append(FuriBlock(tail, ''))
    return blocks
# For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
    """Rebuild (kanji_string, kana_string) from a list of FuriBlocks.

    Blocks with no furi contribute their kanji text to the kana string
    (i.e. they are assumed to already be kana/plain text).
    """
    kanji = ''.join(block.kanji for block in blocks)
    kana = ''.join(block.furi or block.kanji for block in blocks)
    return kanji, kana
def debug_parse_manual_furi_line(line: str):
    """Print each stage of the manual-furi pipeline for one line (debug aid)."""
    blocks = manual_furi_string_to_blocks(line)
    kanji, kana = furi_blocks_reconstruction(blocks)
    print(kanji)
    print(kana)
    syllable_string = kana_to_syllable_string(kana)
    print(syllable_string)
    romaji = '|'.join(to_romaji(syl) for syl in syllable_string.split('|'))
    print(romaji)
# Runs of hiragana only (U+3041..U+309F).
re_hiragana = re.compile(r'[\u3041-\u309f]+')
# Runs of any kana, hiragana or katakana (U+3041..U+30FF).
re_kana = re.compile(r'[\u3041-\u30ff]+')
# Same as re_kana but excluding は (U+306F: the ranges stop at の U+306E and
# resume at ば U+3070), so は can be treated like a kanji when it reads わ —
# see the retry in word_to_furi_blocks.
re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
    """Align one word's written form with its hiragana reading into FuriBlocks.

    *kanji* is the written word (kanji + okurigana), *hiragana* its full
    reading, and *regex* the pattern used to locate the literal-kana runs
    inside the written form (re_kana, or re_kana_no_ha for the は=わ retry).

    Raises AttributeError when the reading cannot be aligned to the written
    form (re.match returns None and .groups() fails) — the caller catches
    this and retries with a different regex.
    """
    # On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
    # The problem is okurigana matching to the hiragana
    # In words with multiple kanji split by okurigana, this is a hard problem.
    # Naive approach: pick out all the kana and make a regex
    # e.g. turn '打ち合わせ' into r'(.*)ち(.*)わせ'
    furi_blocks = []
    kanji_block_indices = []   # positions in furi_blocks still holding bare str
    last_unmatched_pos = 0
    furi_regex_pattern = ''
    for match in regex.finditer(kanji):
        start = match.start()
        if start > last_unmatched_pos:
            # A kanji run precedes this kana run: capture its reading.
            furi_regex_pattern += '(.+)'
            kanji_block_indices.append(len(furi_blocks))
            furi_blocks.append(kanji[last_unmatched_pos:start]) # str not FuriBlock!
        furi = match.group(0)
        # Literal kana in the word must appear verbatim (as hiragana) in the reading.
        furi_regex_pattern += furi.translate(katakana_to_hiragana_dict)
        last_unmatched_pos = match.end()
        furi_blocks.append(FuriBlock(furi, ''))
    if remainder := kanji[last_unmatched_pos:]:
        # Trailing kanji run after the last kana run.
        furi_regex_pattern += '(.+)'
        kanji_block_indices.append(len(furi_blocks))
        furi_blocks.append(remainder) # str not FuriBlock!
    # This will throw on mismatch, e.g. from は=わ
    # NOTE(review): re.match anchors only at the start, not the end — a
    # reading with trailing extra kana would silently match; consider
    # re.fullmatch (would need behavior verification first).
    try:
        furi_groups = re.match(furi_regex_pattern, hiragana).groups() # This could be ambiguous!
    except:
        # print(furi_regex_pattern)
        # print(kanji)
        # print(hiragana)
        raise
    # Replace the placeholder strings with FuriBlocks carrying their readings.
    for idx, furi in zip(kanji_block_indices, furi_groups):
        furi_blocks[idx] = FuriBlock(furi_blocks[idx], furi.strip()) # str -> FuriBlock
    return furi_blocks
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
    """Split one word into FuriBlocks, with a workaround for は=わ.

    If the first alignment fails (AttributeError from the helper), retry
    with は excluded from the kana set so it is treated like a kanji.
    """
    try:
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana)
    except AttributeError:
        blocks = _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
    return blocks
def parse_overrides_str(overrides_str: str) -> dict:
    """Parse user override lines into a dict layered over the defaults.

    '私|わたくし|わたし' -> {'私': ('わたくし', 'わたし')}  (conditional replacement)
    '私|わたし'          -> {'私': 'わたし'}                (unconditional replacement)
    Blank or incomplete lines are skipped; extra fields beyond the third
    are ignored.
    """
    overrides = dict(default_word_overrides)
    for raw_line in overrides_str.splitlines():
        fields = raw_line.strip().split('|')
        kanji, readings = fields[0], fields[1:]
        if not kanji or not readings:
            continue
        if len(readings) > 1:
            overrides[kanji] = (readings[0], readings[1])
        else:
            overrides[kanji] = readings[0]
    return overrides
def parse_japanese_line(line: str, overrides_str: str=''):
    """Parse one lyric line (optionally with '{kanji|furi}' manual markup).

    Returns a dict with:
      'word_pairs':          list of (surface, hiragana) per word
      'furi_blocks':         flat FuriBlock list (must be iterated for timing)
      'hiragana_syllables':  syllables with ' ' word separators mixed in
      'romaji_syllables':    same structure, romanized
    """
    word_overrides = parse_overrides_str(overrides_str)
    # Split line into plaintext segments to be tokenized, and manual furigana segments
    last_unmatched_pos = 0
    word_tokens = []
    for match in re_manual_furi.finditer(line):
        start = match.start()
        end = match.end()
        if start > last_unmatched_pos:
            word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
        word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
        last_unmatched_pos = end
    if remainder := line[last_unmatched_pos:]:
        word_tokens += parse_jp_text(remainder)
    # We now have a series of word tokens that have had all the manual furigana handled
    output = {}
    # Process the tokens
    output['word_pairs'] = []
    for token in (it := iter(word_tokens)):
        word, katakana = token
        if katakana is None:  # TODO: Weird edge case with nonsense input, should fix elsewhere
            katakana = ''
        # MeCab splits ?って into ?っ, て so we merge it back.
        # (Restored literal 'っ' — it had been stripped to an invisible char;
        # endswith also avoids an IndexError on an empty surface form.)
        if word.endswith('っ'):
            try:
                next_word, next_katakana = next(it)
                word += next_word
                katakana += next_katakana or ''  # guard: merged token's reading may be None
            except StopIteration:
                pass
        hiragana = katakana.translate(katakana_to_hiragana_dict)
        if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
            override = word_overrides[word]
            if isinstance(override, tuple):
                # (old, new): replace only when the tokenizer chose the old reading
                if hiragana == override[0]:
                    hiragana = override[1]
            else:
                hiragana = override
        output['word_pairs'].append((word, hiragana))
    # Process our (kanji, hiragana) word pairs into furigana blocks
    output['furi_blocks'] = []  # Must be iterated for timing
    for kanji, hiragana in output['word_pairs']:
        output['furi_blocks'] += word_to_furi_blocks(kanji, hiragana)
    # Create word-spaced hiragana and romaji syllables
    output['hiragana_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    output['romaji_syllables'] = []  # Will have spaces mixed in so must be iterated for timing
    for _, hiragana in output['word_pairs']:
        syllables = [s for syl in kana_to_syllable_list(hiragana) if (s := syl.strip())]
        if syllables:
            output['hiragana_syllables'] += syllables
            output['hiragana_syllables'].append(' ')
            # (Removed a redundant second strip — the list is already filtered.)
            output['romaji_syllables'] += [to_romaji(s) for s in syllables]
            output['romaji_syllables'].append(' ')
    if output['romaji_syllables']:  # remove trailing space
        output['hiragana_syllables'].pop()
        output['romaji_syllables'].pop()
    return output