[WIP] Sound render shader

2024-07-10 22:13:58 +09:30 · 2024-07-10 22:13:58 +09:30 · 89d244eb88
parent 80cbfa7ab8
commit 89d244eb88
3 changed files with 216 additions and 25 deletions
--- a/scripts/loaders/SoundLoader.gd
+++ b/scripts/loaders/SoundLoader.gd
@ -18,16 +18,25 @@ const BYTES_PER_SAMPLE := 2  # 16bit samples
 # !!! Adding a few ms to the loops removes harshness.                                                 !!!
 const HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS := 2                                                         # !!!
 func HACK_EXTEND_LOOP_SAMPLE(audio: AudioStreamSample) -> AudioStreamSample:                        # !!!
 	# Works on a deep duplicate of `audio`: prepends PREPEND_MS of zeroed 16bit samples, shifts both loop points
 	# by the same amount, then appends the data from loop_begin onwards (doubled until it covers at least
 	# HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS). Samples with loop_begin >= loop_end only get the silence prepended.
-	if audio.loop_begin >= audio.loop_end:                                                          # !!!
+	var output: AudioStreamSample = audio.duplicate(true)                                           # !!!
-		return audio                                                                                # !!!
+	# Prepend silence                                                                               # !!!
 	var silent_samples := (audio.mix_rate * PREPEND_MS) / 1000                                      # !!!
 	var silence := PoolByteArray()                                                                  # !!!
 	silence.resize(silent_samples * 2)  # 16bit samples in 8bit array                               # !!!
 	silence.fill(0)                                                                                 # !!!
 	output.data = silence + output.data                                                             # !!!
 	output.loop_begin += silent_samples                                                             # !!!
 	output.loop_end += silent_samples                                                               # !!!
 	# Append looped samples                                                                         # !!!
 	if output.loop_begin >= output.loop_end:                                                        # !!!
 		return output                                                                               # !!!
 	var looped_samples = audio.data.subarray(audio.loop_begin * BYTES_PER_SAMPLE, -1)               # !!!
 	var loop_len = len(looped_samples)                                                              # !!!
 	var target_len = (audio.mix_rate * HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS / 1000) * BYTES_PER_SAMPLE  # !!!
 	while loop_len < target_len:  # Keep doubling in length until it's long enough                    !!!
 		looped_samples += looped_samples                                                            # !!!
 		loop_len = len(looped_samples)                                                              # !!!
-	var output = audio.duplicate(true)                                                              # !!!
+	output.data += looped_samples                                                                   # !!!
 	# NOTE(review): the assignment below rebuilds data from the un-padded `audio.data`, which would discard the
 	# silence prepended above while loop_begin/loop_end stay shifted - confirm whether it was meant to be removed.
 	output.data = audio.data + looped_samples                                                       # !!!
 	return output                                                                                   # !!!
 # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@ -81,14 +90,14 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio
 		return audio
 	var num_packets := size/9
-	var samples = PoolIntArray([0, 0])  # Start with two zero samples for filter purposes, strip them from the actual output
+	var samples = PoolIntArray([0, 0])  # Start with two zero samples for filter purposes, strip them from the actual output later
 	var i := 2
 	for pkt in num_packets:
 		# Decode a single 9byte BRR packet
 		var header_byte := buffer.get_u8()
 		var exponent := header_byte >> 4
 		var filter := (header_byte >> 2) & 0x03
-		var loop := bool(header_byte & 0x02)
+		# var loop := bool(header_byte & 0x02)
 		var end := bool(header_byte & 0x01)
 		for sample in 8:
 			var b := buffer.get_u8()
@ -109,30 +118,26 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio
 		if end:
 			# print('End flag on packet')
 			break
-	# Convert int array to byte array
+	# Remove first two zero samples
-	var audio_data = PoolByteArray()
+	samples.remove(0)
-	# Prepend silence, accounting for the two null samples
+	samples.remove(0)
 	var silent_samples := ((sample_rate * PREPEND_MS) / 1000) - 2
 	audio_data.resize(silent_samples * 2)  # 16bit samples in 8bit array
 	audio_data.fill(0)
 	# Pack 16bit samples to 8bit array
-	for b in samples:
+	var out_buff = StreamPeerBuffer.new()
-		audio_data.append(b & 0xFF)
+	for sample in samples:
-		audio_data.append(b >> 8)
+		out_buff.put_16(sample)
-	audio.data = audio_data
+	audio.data = out_buff.data_array
 	return audio
 func get_inst_sample_data(snes_data: Dictionary, buffer: StreamPeerBuffer, id: int) -> AudioStreamSample:
 	# Decodes instrument `id` through make_sample() and returns it configured as a forward-looping sample.
 	# The sample rate, loop start and BRR pointer for `id` come from snes_data; the 16bit BRR size is read from `buffer`.
 	var sample_rate := get_reference_pitch_samplerate(snes_data.bgm_instrument_samplerates[id] & 0xFF)
 	# NOTE(review): silent_samples is only referenced by the `-` lines below, not the `+` lines - presumably removable; confirm.
 	var silent_samples := ((sample_rate * PREPEND_MS) / 1000)
 	var loop_start_packet: int = snes_data.bgm_instrument_loop_starts[id]/9  # Note that Instrument $1F Steel Guitar has a length of $088B but a loop point of $088D which is 243.22... packets. Luckily it doesn't matter.
 	# Only the low 22 bits of the pointer are used as the seek position
 	buffer.seek(snes_data.bgm_instrument_brr_pointers[id] & 0x3FFFFF)
 	var size := buffer.get_u16()
 	# 9 bytes per BRR packet, 16 samples per packet
 	var num_samples := (size/9)*16
 	var audio := make_sample(buffer, size, sample_rate)
 	audio.loop_mode = AudioStreamSample.LOOP_FORWARD
-	audio.loop_begin = (loop_start_packet * 16) + silent_samples  # Each 9byte packet is 16 samples
+	audio.loop_begin = (loop_start_packet * 16)  # Each 9byte packet is 16 samples
-	audio.loop_end = silent_samples + num_samples
+	audio.loop_end = num_samples
 	# print_debug('Loaded instrument #%02X with lookup offset $%06X, BRR data offset $%06X, length $%04X (%f packets, %d samples) and loop point %d samples' % [id, lookup_offset, brr_offset, size, size/9.0, num_samples, audio.loop_begin])
 	return audio
@ -148,11 +153,10 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer):
 		buffer.seek(brr_spc_addrs[i] + brr_spc_start)
 		# print('Loading sfx sample #%X with BRR data offset $%06X' % [i, buffer.get_position()])
 		var sample_rate := get_reference_pitch_samplerate(snes_data.sfx_samplerates[i] & 0xFF)
 		var silent_samples := ((sample_rate * PREPEND_MS) / 1000)
 		var audio := make_sample(buffer, 900, sample_rate)
 		var loop_start_packet: int = brr_spc_loop_addrs[i] - brr_spc_addrs[i]
 		audio.loop_mode = AudioStreamSample.LOOP_FORWARD
-		audio.loop_begin = (loop_start_packet * 16) + silent_samples  # Each 9byte packet is 16 samples
+		audio.loop_begin = loop_start_packet * 16  # Each 9byte packet is 16 samples
 		audio.loop_end = (len(audio.data)/2)
 		sfx_samples.append(audio)  # Use 900 as a limit, it won't be hit, parser stops after End packet anyway
 		emit_signal('audio_sfx_sample_loaded', i)
@ -162,13 +166,103 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer):
 # Called when the node enters the scene tree for the first time.
 func load_samples(snes_data: Dictionary, buffer: StreamPeerBuffer):
 	# Loads the sfx samples, then every instrument sample (INST_NUM of them), appending each one to
 	# instrument_samples and its loop-extended variant to instrument_samples_HACK_EXTENDED_LOOPS.
 	# Emits 'audio_inst_sample_loaded' with the instrument index after each instrument.
 	load_sfx_samples_data(snes_data, buffer)
 	# The three counters below only feed the commented-out debug prints at the end of this function
 	var largest_sample_idx := -1
 	var largest_sample_sample_count := 0
 	var total_frames := 0
 	# For some reason, this is a bit slow currently under certain editor conditions. Might optimize later.
 	for i in INST_NUM:
 		instrument_samples.append(get_inst_sample_data(snes_data, buffer, i))
 		total_frames += instrument_samples[i].loop_end
 		if largest_sample_sample_count < instrument_samples[i].loop_end:
 			largest_sample_sample_count = instrument_samples[i].loop_end
 			largest_sample_idx = i
 		# Workaround for Godot 3.x quirk where looping samples are interpolated as if they go to nothing instead of looping
 		instrument_samples_HACK_EXTENDED_LOOPS.append(HACK_EXTEND_LOOP_SAMPLE(instrument_samples[i]))
-		print('Instrument %02X has mix_rate %d Hz'%[i, instrument_samples[i].mix_rate])
+		# print('Instrument %02X has mix_rate %d Hz and %d samples'%[i, instrument_samples[i].mix_rate, len(instrument_samples[i].data)/2])
 		emit_signal('audio_inst_sample_loaded', i)
 	# print('Largest sample is instrument %d with length %d and mix_rate %d'%[largest_sample_idx, largest_sample_sample_count, instrument_samples[largest_sample_idx].mix_rate])
 	# print('Total frames: %d'%total_frames)
 # We start the texture with a bunch of same-size headers
 #     uint16 sample_start       // The true start, after the prepended 3 frames of silence
 #     uint16 sample_length      // 3 frames after the true end, because of how we loop
 #     uint16 sample_loop_begin  // 3 frames after the true loop point
 #     uint16 mixrate
 #     2*uint8 AD of ADSR ([0.0, 1.0] is fine)
 #     2*uint8 SR of ADSR ([0.0, 1.0] is fine)
 var samples_tex: ImageTexture
 const TEX_WIDTH := 2048
 const FILTER_PAD := 3
 func samples_to_texture():
 	# Packs every instrument and sfx sample into self.samples_tex, a TEX_WIDTH-wide LA8 ImageTexture (one 16bit value per texel).
 	# Layout: 6 header texels per sample, followed by each sample's S16LE data wrapped in FILTER_PAD frames of
 	# leading silence and FILTER_PAD*2 trailing frames (a continuation of the loop, or silence if the sample does not loop).
 	var num_samples := INST_NUM + SFX_NUM
 	var header_length := num_samples * 6
 	# Create header and unwrapped payload separately first.
 	# PoolByteArray is passed by value, so a local array assigned to data_array never sees later writes:
 	# write through the buffers and only read data_array back once they are complete.
 	var header_buffer := StreamPeerBuffer.new()
 	var payload_buffer := StreamPeerBuffer.new()
 	for sample in instrument_samples + sfx_samples:
 		var loop_end: int = sample.loop_end
 		var loop_begin: int = sample.loop_begin
 		var nonlooping: bool = loop_begin >= loop_end
 		if nonlooping:
 			# Loop over a single frame of the trailing silence so the stored loop length is never zero or negative
 			loop_begin = loop_end
 			loop_end += 1
 		# NOTE(review): these are uint16 fields; a texel offset above 65535 will not fit - confirm against the total payload size
 		header_buffer.put_u16(header_length + (payload_buffer.get_size()/2) + FILTER_PAD)  # sample_start
 		header_buffer.put_u16(loop_end + FILTER_PAD)  # sample_length
 		header_buffer.put_u16(loop_begin + FILTER_PAD)  # sample_loop_begin
 		header_buffer.put_u16(sample.mix_rate)  # sample_mixrate
 		header_buffer.put_u8(0)  # TODO: attack
 		header_buffer.put_u8(0)  # TODO: decay
 		header_buffer.put_u8(0)  # TODO: sustain
 		header_buffer.put_u8(0)  # TODO: release
 		for i in FILTER_PAD:  # Prepend 3 frames of silence
 			payload_buffer.put_16(0)
 		payload_buffer.put_data(sample.data)  # Copy entire S16LE audio data
 		if nonlooping:
 			for i in FILTER_PAD*2:
 				payload_buffer.put_16(0)  # 6 frames of trailing silence to loop
 		else:
 			# Copy frame by frame in case the loop is shorter than 6 frames
 			var loop_length_bytes: int = (loop_end - loop_begin) * BYTES_PER_SAMPLE  # seek() positions are in bytes, not frames
 			for i in FILTER_PAD*2:
 				var pos := payload_buffer.get_position()
 				payload_buffer.seek(pos - loop_length_bytes)
 				var frame := payload_buffer.get_16()
 				payload_buffer.seek(pos)
 				payload_buffer.put_16(frame)
 	# Combine the unwrapped arrays
 	var data := header_buffer.data_array + payload_buffer.data_array
 	# Now calculate wrapping and rowwise padding for the combined array
 	for row in TEX_WIDTH:
 		var row_end: int = (row + 1) * TEX_WIDTH * 2  # Remember: 8bit array, 16bit values
 		if len(data) > row_end:  # Both sides in bytes
 			# [... a b c] + [a b c] + [a b c ...]
 			data = data.subarray(0, row_end-1) + data.subarray(row_end-FILTER_PAD*2, row_end-1) + data.subarray(row_end-FILTER_PAD*2, -1)
 		else:
 			break
 	# Round the row count up to the next power of two
 	var needed_rows := (len(data)/2)/float(TEX_WIDTH)
 	var rows := int(pow(2, ceil(log(needed_rows) / log(2))))
 	if rows > TEX_WIDTH:
 		print_debug('Sound Sample Texture rows have exceeded width: %d > %d'%[rows, TEX_WIDTH])
 	# Now that the full texture size is known, pad our existing data with zeroes until the end
 	var final_data_size_bytes = rows * TEX_WIDTH * 2
 	if final_data_size_bytes > len(data):
 		var end_padding := PoolByteArray()
 		end_padding.resize(final_data_size_bytes - len(data))
 		end_padding.fill(0)
 		data = data + end_padding
 	# data is complete, turn it into an ImageTexture for the shader to use
 	var samples_img = Image.new()
 	samples_img.create_from_data(TEX_WIDTH, rows, false, Image.FORMAT_LA8, data)
 	self.samples_tex = ImageTexture.new()
 	self.samples_tex.create_from_image(samples_img, Texture.FLAG_FILTER)
 var player := AudioStreamPlayer.new()  # Make one for each channel, later
--- a/shaders/audio_renderer.gdshader
+++ b/shaders/audio_renderer.gdshader
@ -1,3 +1,6 @@
 // ============================================================= BOILERPLATE =============================================================
 // While most of the data we are working with is integral, GPU conversion overheads mean almost all of this will be floats.
 // Unfortunately, this loses type-checking on [0.0, 1.0] vs [0,255] etc. so a lot of this will involve comments declaring ranges.
 shader_type canvas_item;
 render_mode blend_premul_alpha;
 const float TEX_SIZE = 4096.0;
@ -13,11 +16,16 @@ const float x10000 = float(0x10000);  // 65536.0
 const vec2 INT16_DOT_BE = vec2(xFF00, x00FF);
 const vec2 INT16_DOT_LE = vec2(x00FF, xFF00);
 uniform sampler2D tex : hint_normal;
 float unpack_uint16(vec2 uint16) {
 	// Takes a little-endian 2byte integer that was sampled as two [0.0, 1.0] range floats
 	// and rebuilds the original value [0, 65535] as a float32.
 	float value = dot(uint16, INT16_DOT_LE);
 	return value;
 }
 float unpack_int16(vec2 int16) {
-	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats,
+	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats, to the original int value [-32768, 32767] in float32
 	// to the original int value [-32768, 32767] or [0, 65535] but in float32
 	float unsigned = dot(int16, INT16_DOT_LE);
 	// 0x7FFF itself is still positive; only 0x8000 and above wrap to negatives.
 	// The half-step margin keeps float rounding in the unpacked value from flipping the sign at the boundary.
 	return unsigned - (unsigned < (x7FFF + 0.5) ? 0.0 : x10000);
 }
@ -48,6 +56,76 @@ vec4 test_writeback(vec2 uv) {
 	return output;
 }
 // =============================================================    LOGIC    =============================================================
 // We have around 200k frames across 35 instrument samples
 // 35 instrument samples and 8 sfx samples = 43 samples
 // 2048x128 texture maybe? at 2bytes per texel, that's 512KiB of VRAM
 // We start the texture with a bunch of same-size headers
 //     uint16 sample_start       // The true start, after the prepended 3 frames of silence
 //     uint16 sample_length      // 3 frames after the true end, because of how we loop
 //     uint16 sample_loop_begin  // 3 frames after the true loop point
 //     uint16 mixrate
 //     2*uint8 AD of ADSR ([0.0, 1.0] is fine)
 //     2*uint8 SR of ADSR ([0.0, 1.0] is fine)
 // So six texture() calls spent on header information, and one on the final lookup.
 // Alternatively, sample length could be omitted and fetched as the start of the next entry to save redundant entries.
 //
 // To accommodate filtering, every sample must begin with 3 frames of silence, and end with 6 frames of the beginning of the loop.
 // Looped playback will go from the first 3 of 6 frames at the end, to the third frame after the loop start point, to avoid filter bleeding.
 // If a sample does not loop, it must have 6 frames of silence at the end, not including the subsequent next sample's 3 frames of silence prefix.
 // As such, every sample will have an additional 9 frames, 3 before, 6 after.
 // Additionally, every row of the texture must have 3 redundant frames on either side - i.e., we only sample from [3, 2045) on any given row.
 // So the payload of a 2048-wide texture will be 2042 per row, excluding the initial header.
 // So for 43 samples, a header of 43*6 = 258 texels starts the first row,
 //   after which the first sample's 3 frames of silence (3 texels of (0.0, 0.0), 6 bytes of 0x00) may begin.
 // A 2048x128 texture would have a payload of 2042x128 = 261376 frames (texels) excluding header
 // With the 258 texel header, which uses 3 texels of margin, 255 would be subtracted from the above payload,
 //   leaving 261121 texels for the sample data.
 const float HEADER_LENGTH_TEXELS = 6.0;
 uniform sampler2D instrument_samples;
 uniform vec2 instrument_samples_size = vec2(2048.0, 128.0);
 uniform float instrument_row_padding = 3.0;  // In case we want to go to cubic filtering
 uniform float instrument_row_payload = 2042.0;  // 2048-3-3 Make sure to set with instrument_samples_size and instrument_row_padding!
 uniform float reference_note = 71.0;  // [0, 255], possibly [0, 127]
 uniform float output_mixrate = 32000.0;  // SNES SPC output is 32kHz
 float get_pitch_scale(float note) {
 	// Playback-rate multiplier for `note` relative to reference_note: doubles every 12 notes.
 	// Same as pow(2.0, (note - reference_note)/12.0), written with exp2.
 	float octaves = (note - reference_note)/12.0;
 	return exp2(octaves);
 }
 vec2 get_inst_texel(vec2 xy) {
 	// Fetches the two bytes (.x and .w channels) of the instrument texture at texel coordinate xy.
 	// xy is in texels, not UV. Texel centres sit at half-integer positions, so offset by 0.5:
 	// an integral xy then reads exactly that texel rather than a filtered blend with its neighbours/adjacent row,
 	// and a fractional x interpolates between frame floor(x) and the next one.
 	return texture(instrument_samples, (xy + vec2(0.5))/instrument_samples_size).xw;
 }
 float get_instrument_sample(float instrument_index, float pitch_scale, float t, float t_end) {
 	// Reads the header of instrument `instrument_index`, converts time t into a frame position at the
 	// pitch-scaled mix rate, wraps that position back into the loop once it runs past the end of the sample,
 	// and returns the frame found there after rescale_int16().
 	// t is presumably in seconds, as it is multiplied by the mix rate - confirm against the caller.
 	// t_end is for ADSR purposes (not used yet; neither are attack_decay and sustain_release)
 	float header_offset = instrument_index * HEADER_LENGTH_TEXELS;
 	float sample_start = unpack_uint16(get_inst_texel(vec2(header_offset, 0.0)));  // The true start, after the prepended 3 frames of silence
 	float sample_length = unpack_uint16(get_inst_texel(vec2(header_offset + 1.0, 0.0)));  // 3 frames after the true end, because of how we loop
 	float sample_loop_begin = unpack_uint16(get_inst_texel(vec2(header_offset + 2.0, 0.0)));  // 3 frames after the true loop point
 	float sample_mixrate = unpack_uint16(get_inst_texel(vec2(header_offset + 3.0, 0.0)));
 	vec2 attack_decay = get_inst_texel(vec2(header_offset + 4.0, 0.0));
 	vec2 sustain_release = get_inst_texel(vec2(header_offset + 5.0, 0.0));
 	// Calculate the point we want to sample in linear space
 	float mixrate = sample_mixrate * pitch_scale;
 	float target_frame = t * mixrate;
 	// If we're past the end of the sample, we need to wrap it back to within the loop range
 	// Guard: a zero or negative loop length in the header would turn the division below into inf/NaN
 	float loop_length = max(sample_length - sample_loop_begin, 1.0);
 	float overshoot = max(target_frame - sample_length, 0.0);
 	float overshoot_loops = ceil(overshoot/loop_length);
 	target_frame -= overshoot_loops*loop_length;
 	// Now we need to identify the sampling point since our frames are spread across multiple rows for GPU reasons
 	// The first instrument_row_padding texels of each row are lead-in for filtering, so x is offset past them
 	// Note that y should be integral, but x should be continuous, as that's what applies the filtering!
 	target_frame += sample_start;
 	vec2 sample_xy = vec2(instrument_row_padding + mod(target_frame, instrument_row_payload), trunc(target_frame/instrument_row_payload));
 	return rescale_int16(unpack_int16(get_inst_texel(sample_xy)));
 }
 void fragment() {
 	// GLES2
 	vec2 uv = vec2(UV.x, 1.0-UV.y);
--- a/test/audio_system.gd
+++ b/test/audio_system.gd
@ -124,4 +124,23 @@ func _ready() -> void:
 	$btn_hack_loop_extension.text += ' (%dms)'%SoundLoader.HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS
 	for i in len(RomLoader.snes_data.bgm_song_pointers):
 		var pointer = RomLoader.snes_data.bgm_song_pointers[i]
-		print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer])
+		# print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer])
 # 	var test_payload := PoolByteArray()
 # 	test_payload.resize(4096*4096*2)
 # 	# for i in 5:
 # 	# 	test_payload.fill(i*2+10)
 # 	# 	$'%audio_renderer'.render_queue.append(test_payload)
 # 	test_payload.fill(0)
 # 	for i in 65536:
 # 		test_payload.set(i*2, i%256)
 # 		test_payload.set(i*2+1, i/256)
 # 	$'%audio_renderer'.render_queue.append(test_payload)
 # 	# $'%audio_renderer'.render_queue.append(test_payload)
 # func _process(_delta):
 # 	update()
 # func _draw() -> void:
 # 	if $'%audio_renderer'.waiting_for_viewport:
 # 		$'%audio_renderer'.get_result()