KaraokeTestudaiBackend/japanese_converters.py

from format import LyricLine, FuriBlock
from fugashi import Tagger
import pykakasi
import re


kks = pykakasi.kakasi()
def to_romaji(japanese: str, separator=' ') -> str:
	"""Convert Japanese text to Hepburn romaji.

	The text is passed through the module-level ``kks`` converter and the
	``'hepburn'`` entry of every item it returns is joined with ``separator``.
	"""
	converted_items = kks.convert(japanese)
	hepburn_parts = (item['hepburn'] for item in converted_items)
	return separator.join(hepburn_parts)

# Translation table for str.translate(): katakana U+30A1..U+30F6 map to the
# hiragana 0x60 code points below them (U+3041..U+3096).
katakana_to_hiragana_dict = {
	katakana_cp: chr(katakana_cp - 0x60)
	for katakana_cp in range(0x30A1, 0x30F7)
}
# An ASCII asterisk is translated to an ideographic (full-width) space.
katakana_to_hiragana_dict[ord('*')] = '　'


# Kana that attach to the syllable before them (small kana and ん/ン)
kana_merge_previous_syllable = set('ゃゅょぁぃぇぉぅゎんャュョァィェォゥヮン')
# Kana that attach to the syllable after them (sokuon)
kana_merge_next_syllable = set('っッ')
def kana_to_syllable_string(kana: str):
	"""Split kana into syllables, returning them as one '|'-prefixed string.

	Every syllable in the result is preceded by '|', e.g. 'がっこう' becomes
	'|が|っこ|う'. An empty input returns ''.

	- Kana in ``kana_merge_previous_syllable`` are appended to the preceding
	  syllable. If there is no preceding syllable (the kana is the first
	  character), it starts a syllable of its own so that the result always
	  begins with '|'.
	- Kana in ``kana_merge_next_syllable`` start a new syllable together with
	  the character that follows them. If nothing follows, the sokuon forms a
	  syllable on its own instead of raising StopIteration.
	"""
	parts = []
	it = iter(kana)
	for k in it:
		if k in kana_merge_next_syllable:
			# Consume the following character too; '' when the sokuon is last
			parts.append('|' + k + next(it, ''))
		elif parts and k in kana_merge_previous_syllable:
			parts.append(k)
		else:
			parts.append('|' + k)
	return ''.join(parts)

def kana_to_syllable_list(kana: str) -> list[str]:
	"""Split kana into a list of syllables.

	Delegates to ``kana_to_syllable_string`` and splits its result on '|'.
	Only a leading '|' separator is removed before splitting; a leading kana
	is never discarded. An empty input yields [''] (the result of splitting
	an empty string).
	"""
	# Kinda inefficient to work it as a string and then split it afterwards, but elegant
	return kana_to_syllable_string(kana).removeprefix('|').split('|')


tagger = Tagger('-Owakati')
re_wa = re.compile(r'ワ')
def _parse_jp_token(token) -> tuple[str, str]:
	"""Return ``(surface, reading)`` for a single tagger token.

	The surface is ``str(token)`` and the reading is ``token.feature.kana``.
	Where the pronunciation has ワ at a position where the reading has ハ,
	the reading is corrected to ワ. This is only attempted when both strings
	have the same length; otherwise a diagnostic is printed and the reading
	is returned unchanged.

	If the pronunciation or the reading is None the correction is skipped and
	a None reading is returned as '' so the result is always a pair of strings.
	"""
	s = str(token)
	pronunciation = token.feature.pron  # katakana with ー for long vowels, might not be 1:1 hiragana/romaji conversion
	reading = token.feature.kana  # katakana with correct vowels
	if pronunciation is None or reading is None:
		# NOTE(review): presumably fugashi leaves these fields as None for
		# tokens without dictionary data - confirm. len() below would raise
		# TypeError on None, so bail out early.
		return (s, reading or '')
	# This is a hack to deal with は＝わ morae
	if len(pronunciation) == len(reading):
		for match in re_wa.finditer(pronunciation):
			i = match.start()
			if reading[i] == 'ハ':
				print('Found ハ＝ワ:', s, pronunciation, reading)
				reading = reading[:i] + 'ワ' + reading[i+1:]
	else:
		print('Different lengths pronunciation and reading:', s, pronunciation, reading)
	return (s, reading)
def parse_jp_text(text: str) -> list[tuple[str, str]]:
	"""Run the module-level tagger over ``text`` and return one
	``_parse_jp_token`` result per token, in token order."""
	pairs = []
	for token in tagger(text):
		pairs.append(_parse_jp_token(token))
	return pairs

# Our custom word overrides have two levels:
# - The first is a simple search-and-replace that turns matches into the manual furigana "{kanji|furi}" format. This can produce false hits on short words.
# - The second overrides a word's kana after tokenization, which requires the word to be a dictionary word with multiple readings.
# A tuple value is (reading to replace, replacement reading); a plain string
# value replaces the reading unconditionally.
default_word_overrides = {
	'私': ('わたくし', 'わたし'),
	'主': 'しゅ',
}
# Matches one manual furigana segment "{kanji|furi}", capturing both halves.
re_manual_furi = re.compile(r'{(.+?)\|(.+?)}')

def manual_furi_string_to_blocks(line: str) -> list[FuriBlock]:
	"""Turn a line containing "{kanji|furi}" segments into FuriBlocks.

	Each manual furigana segment becomes ``FuriBlock(kanji, furi)``. Any text
	between, before or after the segments becomes a FuriBlock with an empty
	furi. Blocks are returned in the order they appear in ``line``.
	"""
	result = []
	cursor = 0  # End of the last manual furigana segment consumed so far
	for furi_match in re_manual_furi.finditer(line):
		seg_start, seg_end = furi_match.span()
		if seg_start > cursor:
			# Plain text sitting in front of this segment
			result.append(FuriBlock(line[cursor:seg_start], ''))
		base_text, reading = furi_match.groups()
		result.append(FuriBlock(base_text, reading))
		cursor = seg_end
	tail = line[cursor:]
	if tail:
		result.append(FuriBlock(tail, ''))
	return result


# For debugging
def furi_blocks_reconstruction(blocks: list[FuriBlock]):
	kanji = ''.join([b.kanji for b in blocks])
	kana = ''.join([b.furi if b.furi else b.kanji for b in blocks])
	return kanji, kana

def debug_parse_manual_furi_line(line: str):
	"""Print the intermediate forms of a manual furigana line.

	Printed in order: the reconstructed text, its kana reading, the
	'|'-separated syllable string, and the romaji of each '|'-separated piece
	joined by '|'.
	"""
	kanji, kana = furi_blocks_reconstruction(manual_furi_string_to_blocks(line))
	print(kanji)
	print(kana)
	syllables = kana_to_syllable_string(kana)
	print(syllables)
	romaji_parts = []
	for piece in syllables.split('|'):
		romaji_parts.append(to_romaji(piece))
	print('|'.join(romaji_parts))


re_hiragana = re.compile(r'[\u3041-\u309f]+')
re_kana = re.compile(r'[\u3041-\u30ff]+')
re_kana_no_ha = re.compile(r'[\u3041-\u306e\u3070-\u30ff]+')
def _word_to_furi_blocks(kanji, hiragana, regex) -> list[FuriBlock]:
	"""Split a word into furigana blocks by aligning its kana with its reading.

	``regex`` selects the runs of ``kanji`` that count as kana. Those runs
	become FuriBlocks with an empty furi; every other run becomes a FuriBlock
	whose furi is the part of ``hiragana`` captured for it, stripped of
	surrounding whitespace.

	Raises AttributeError when the pattern built from the word does not match
	``hiragana`` (e.g. from は＝わ), because ``re.match`` then returns None.
	"""
	# On longer words this may incorrectly split and may need manual furi, which our pipeline is not set up for
	# The problem is okurigana matching to the hiragana
	# In words with multiple kanji split by okurigana, this is a hard problem.

	# Naive approach: pick out all the kana and make a regex
	# e.g. turn '打ち合わせ' into r'(.+)ち(.+)わせ'
	blocks = []  # Holds FuriBlocks plus plain str placeholders for non-kana runs
	placeholder_slots = []  # Indices in `blocks` that still hold a plain str
	pattern_parts = []
	cursor = 0

	def add_non_kana_run(text):
		# A non-kana run gets a capture group and a placeholder to fill in later
		pattern_parts.append('(.+)')
		placeholder_slots.append(len(blocks))
		blocks.append(text)

	for kana_match in regex.finditer(kanji):
		run_start, run_end = kana_match.span()
		if run_start > cursor:
			add_non_kana_run(kanji[cursor:run_start])
		kana_run = kana_match.group(0)
		pattern_parts.append(kana_run.translate(katakana_to_hiragana_dict))
		blocks.append(FuriBlock(kana_run, ''))
		cursor = run_end
	tail = kanji[cursor:]
	if tail:
		add_non_kana_run(tail)

	# A failed match returns None, so .groups() raises AttributeError
	# The split between groups could be ambiguous!
	readings = re.match(''.join(pattern_parts), hiragana).groups()
	for slot, reading in zip(placeholder_slots, readings):
		blocks[slot] = FuriBlock(blocks[slot], reading.strip())  # str -> FuriBlock
	return blocks
def word_to_furi_blocks(kanji, hiragana) -> list[FuriBlock]:
	"""Build furigana blocks for a word, with a fallback for は＝わ.

	The word is first parsed with every kana treated as kana (``re_kana``).
	If that raises AttributeError, it is parsed again with ``re_kana_no_ha``,
	which leaves は out of the kana set so it is handled like a kanji. An
	AttributeError from the second attempt propagates to the caller.
	"""
	try:
		blocks = _word_to_furi_blocks(kanji, hiragana, re_kana)
	except AttributeError:
		# Workaround for は＝わ - retry treating は as a kanji
		blocks = _word_to_furi_blocks(kanji, hiragana, re_kana_no_ha)
	return blocks


def parse_overrides_str(overrides_str: str) -> dict:
	"""Parse user word overrides, layered on top of ``default_word_overrides``.

	Each non-empty line has the form 'word|reading' or 'word|old|new':

	- '私|わたくし|わたし' becomes {'私': ('わたくし', 'わたし')}
	- '私|わたし' becomes {'私': 'わたし'}

	Whitespace around every field is stripped and empty fields are dropped,
	so a stray trailing '|' cannot install an empty reading. Lines without a
	word or without any reading are skipped. Fields beyond the second reading
	are ignored. Returns a new dict; ``default_word_overrides`` is not
	modified.
	"""
	overrides_dict = default_word_overrides.copy()
	for line in overrides_str.splitlines():
		kanji, *replacement = (field.strip() for field in line.split('|'))
		# Drop empty readings such as the one produced by '私|'
		replacement = [reading for reading in replacement if reading]
		if not kanji or not replacement:
			continue
		if len(replacement) > 1:
			overrides_dict[kanji] = (replacement[0], replacement[1])
		else:
			overrides_dict[kanji] = replacement[0]
	return overrides_dict


def parse_japanese_line(line: str, overrides_str: str=''):
	"""Parse one lyric line into word pairs, furigana blocks and syllables.

	Manual furigana segments ("{kanji|furi}") are taken as given; the text
	between them is tokenized with ``parse_jp_text``. ``overrides_str`` is
	parsed with ``parse_overrides_str`` and applied to whole tokens.

	Returns a dict with these keys, in this order:

	- 'word_pairs': list of (word, hiragana) tuples
	- 'furi_blocks': concatenated ``word_to_furi_blocks`` results per pair
	- 'hiragana_syllables': syllables with a ' ' entry between words
	- 'romaji_syllables': ``to_romaji`` of each syllable, same ' ' entries

	Exceptions raised by ``word_to_furi_blocks`` are not caught here.
	"""
	word_overrides = parse_overrides_str(overrides_str)

	# Split line into plaintext segments to be tokenized, and manual furigana segments
	word_tokens = []
	last_unmatched_pos = 0
	for match in re_manual_furi.finditer(line):
		start, end = match.span()
		if start > last_unmatched_pos:
			word_tokens += parse_jp_text(line[last_unmatched_pos:start])  # Process japanese plaintext
		word_tokens.append(match.groups())  # Already sorted into (kanji, hiragana)
		last_unmatched_pos = end
	if remainder := line[last_unmatched_pos:]:
		word_tokens += parse_jp_text(remainder)
	# We now have a series of word tokens that have had all the manual furigana handled

	# Process the tokens
	word_pairs = []
	it = iter(word_tokens)
	for word, katakana in it:
		katakana = katakana or ''  # A reading may be None on nonsense input
		if word.endswith('っ'):  # MeCab splits ？って into ？っ, て so we merge it back
			# Nothing is merged when this is the last token
			next_word, next_katakana = next(it, ('', None))
			word += next_word
			katakana += next_katakana or ''  # The merged token's reading may be None too
		hiragana = katakana.translate(katakana_to_hiragana_dict)
		if word in word_overrides:  # Note that most word replacements will instead need to be handled BEFORE tokenization!
			override = word_overrides[word]
			if not isinstance(override, tuple):
				hiragana = override
			elif hiragana == override[0]:
				# Tuple override: only replace the one reading it names
				hiragana = override[1]
		word_pairs.append((word, hiragana))

	# Process our (kanji, hiragana) word pairs into furigana blocks
	furi_blocks = []  # Must be iterated for timing
	for kanji, hiragana in word_pairs:
		furi_blocks += word_to_furi_blocks(kanji, hiragana)

	# Create word-spaced hiragana and romaji syllables
	hiragana_syllables = []  # Will have spaces mixed in so must be iterated for timing
	romaji_syllables = []  # Will have spaces mixed in so must be iterated for timing
	for _, hiragana in word_pairs:
		syllables = [s for s in map(str.strip, kana_to_syllable_list(hiragana)) if s]
		if not syllables:
			continue
		hiragana_syllables += syllables
		romaji_syllables += [to_romaji(s) for s in syllables]
		# Word separator; both lists stay the same length
		hiragana_syllables.append(' ')
		romaji_syllables.append(' ')
	if romaji_syllables:  # remove trailing space
		hiragana_syllables.pop()
		romaji_syllables.pop()

	return {
		'word_pairs': word_pairs,
		'furi_blocks': furi_blocks,
		'hiragana_syllables': hiragana_syllables,
		'romaji_syllables': romaji_syllables,
	}