diff --git a/scripts/loaders/SoundLoader.gd b/scripts/loaders/SoundLoader.gd index 1bb5376..9bb39db 100644 --- a/scripts/loaders/SoundLoader.gd +++ b/scripts/loaders/SoundLoader.gd @@ -18,16 +18,25 @@ const BYTES_PER_SAMPLE := 2 # 16bit samples # !!! Adding a few ms to the loops removes harshness. !!! const HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS := 2 # !!! func HACK_EXTEND_LOOP_SAMPLE(audio: AudioStreamSample) -> AudioStreamSample: # !!! - if audio.loop_begin >= audio.loop_end: # !!! - return audio # !!! + var output: AudioStreamSample = audio.duplicate(true) # !!! + # Prepend silence # !!! + var silent_samples := (audio.mix_rate * PREPEND_MS) / 1000 # !!! + var silence := PoolByteArray() # !!! + silence.resize(silent_samples * 2) # 16bit samples in 8bit array # !!! + silence.fill(0) # !!! + output.data = silence + output.data # !!! + output.loop_begin += silent_samples # !!! + output.loop_end += silent_samples # !!! + # Append looped samples # !!! + if output.loop_begin >= output.loop_end: # !!! + return output # !!! var looped_samples = audio.data.subarray(audio.loop_begin * BYTES_PER_SAMPLE, -1) # !!! var loop_len = len(looped_samples) # !!! var target_len = (audio.mix_rate * HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS / 1000) * BYTES_PER_SAMPLE # !!! while loop_len < target_len: # Keep doubling in length until it's long enough !!! looped_samples += looped_samples # !!! loop_len = len(looped_samples) # !!! - var output = audio.duplicate(true) # !!! - output.data = audio.data + looped_samples # !!! + output.data += looped_samples # !!! return output # !!! # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
@@ -81,14 +90,14 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio return audio var num_packets := size/9 - var samples = PoolIntArray([0, 0]) # Start with two zero samples for filter purposes, strip them from the actual output + var samples = PoolIntArray([0, 0]) # Start with two zero samples for filter purposes, strip them from the actual output later var i := 2 for pkt in num_packets: # Decode a single 9byte BRR packet var header_byte := buffer.get_u8() var exponent := header_byte >> 4 var filter := (header_byte >> 2) & 0x03 - var loop := bool(header_byte & 0x02) + # var loop := bool(header_byte & 0x02) var end := bool(header_byte & 0x01) for sample in 8: var b := buffer.get_u8() @@ -109,30 +118,26 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio if end: # print('End flag on packet') break - # Convert int array to byte array - var audio_data = PoolByteArray() - # Prepend silence, accounting for the two null samples - var silent_samples := ((sample_rate * PREPEND_MS) / 1000) - 2 - audio_data.resize(silent_samples * 2) # 16bit samples in 8bit array - audio_data.fill(0) + # Remove first two zero samples + samples.remove(0) + samples.remove(0) # Pack 16bit samples to 8bit array - for b in samples: - audio_data.append(b & 0xFF) - audio_data.append(b >> 8) - audio.data = audio_data + var out_buff = StreamPeerBuffer.new() + for sample in samples: + out_buff.put_16(sample) + audio.data = out_buff.data_array return audio func get_inst_sample_data(snes_data: Dictionary, buffer: StreamPeerBuffer, id: int) -> AudioStreamSample: var sample_rate := get_reference_pitch_samplerate(snes_data.bgm_instrument_samplerates[id] & 0xFF) - var silent_samples := ((sample_rate * PREPEND_MS) / 1000) var loop_start_packet: int = snes_data.bgm_instrument_loop_starts[id]/9 # Note that Instrument $1F Steel Guitar has a length of $088B but a loop point of $088D which is 243.22... packets. Luckily it doesn't matter. 
buffer.seek(snes_data.bgm_instrument_brr_pointers[id] & 0x3FFFFF) var size := buffer.get_u16() var num_samples := (size/9)*16 var audio := make_sample(buffer, size, sample_rate) audio.loop_mode = AudioStreamSample.LOOP_FORWARD - audio.loop_begin = (loop_start_packet * 16) + silent_samples # Each 9byte packet is 16 samples - audio.loop_end = silent_samples + num_samples + audio.loop_begin = (loop_start_packet * 16) # Each 9byte packet is 16 samples + audio.loop_end = num_samples # print_debug('Loaded instrument #%02X with lookup offset $%06X, BRR data offset $%06X, length $%04X (%f packets, %d samples) and loop point %d samples' % [id, lookup_offset, brr_offset, size, size/9.0, num_samples, audio.loop_begin]) return audio @@ -148,11 +153,10 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer): buffer.seek(brr_spc_addrs[i] + brr_spc_start) # print('Loading sfx sample #%X with BRR data offset $%06X' % [i, buffer.get_position()]) var sample_rate := get_reference_pitch_samplerate(snes_data.sfx_samplerates[i] & 0xFF) - var silent_samples := ((sample_rate * PREPEND_MS) / 1000) var audio := make_sample(buffer, 900, sample_rate) var loop_start_packet: int = brr_spc_loop_addrs[i] - brr_spc_addrs[i] audio.loop_mode = AudioStreamSample.LOOP_FORWARD - audio.loop_begin = (loop_start_packet * 16) + silent_samples # Each 9byte packet is 16 samples + audio.loop_begin = loop_start_packet * 16 # Each 9byte packet is 16 samples audio.loop_end = (len(audio.data)/2) sfx_samples.append(audio) # Use 900 as a limit, it won't be hit, parser stops after End packet anyway emit_signal('audio_sfx_sample_loaded', i) @@ -162,13 +166,103 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer): # Called when the node enters the scene tree for the first time. 
func load_samples(snes_data: Dictionary, buffer: StreamPeerBuffer): load_sfx_samples_data(snes_data, buffer) + var largest_sample_idx := -1 + var largest_sample_sample_count := 0 + var total_frames := 0 # For some reason, this is a bit slow currently under certain editor conditions. Might optimize later. for i in INST_NUM: instrument_samples.append(get_inst_sample_data(snes_data, buffer, i)) + total_frames += instrument_samples[i].loop_end + if largest_sample_sample_count < instrument_samples[i].loop_end: + largest_sample_sample_count = instrument_samples[i].loop_end + largest_sample_idx = i # Workaround for Godot 3.x quirk where looping samples are interpolated as if they go to nothing instead of looping instrument_samples_HACK_EXTENDED_LOOPS.append(HACK_EXTEND_LOOP_SAMPLE(instrument_samples[i])) - print('Instrument %02X has mix_rate %d Hz'%[i, instrument_samples[i].mix_rate]) + # print('Instrument %02X has mix_rate %d Hz and %d samples'%[i, instrument_samples[i].mix_rate, len(instrument_samples[i].data)/2]) emit_signal('audio_inst_sample_loaded', i) + # print('Largest sample is instrument %d with length %d and mix_rate %d'%[largest_sample_idx, largest_sample_sample_count, instrument_samples[largest_sample_idx].mix_rate]) + # print('Total frames: %d'%total_frames) + + +# We start the texture with a bunch of same-size headers +# uint16 sample_start // The true start, after the prepended 3 frames of silence +# uint16 sample_length // 3 frames after the true end, because of how we loop +# uint16 sample_loop_begin // 3 frames after the true loop point +# uint16 mixrate +# 2*uint8 AD of ADSR ([0.0, 1.0] is fine) +# 2*uint8 SR of ADSR ([0.0, 1.0] is fine) +var samples_tex: ImageTexture +const TEX_WIDTH := 2048 +const FILTER_PAD := 3 +func samples_to_texture(): + var num_samples := INST_NUM + SFX_NUM + var header_length := num_samples * 6 + + # Create header and unwrapped payload separately first + var header_data := PoolByteArray() + var header_buffer := 
StreamPeerBuffer.new() + header_buffer.data_array = header_data + var payload_data := PoolByteArray() + var payload_buffer := StreamPeerBuffer.new() + payload_buffer.data_array = payload_data + + for sample in instrument_samples + sfx_samples: + var loop_end: int = sample.loop_end + var loop_begin: int = sample.loop_begin + var nonlooping: bool = loop_begin >= loop_end + if nonlooping: + loop_begin = loop_end + loop_end += 1 + header_buffer.put_u16(header_length + (payload_buffer.get_size()/2) + FILTER_PAD) # sample_start; measured from the buffer because payload_data is a by-value copy that stays empty. NOTE(review): a u16 cannot hold a start past 65535 frames - confirm total payload size + header_buffer.put_u16(loop_end + FILTER_PAD) # sample_length, using the value adjusted for non-looping samples + header_buffer.put_u16(loop_begin + FILTER_PAD) # sample_loop_begin, using the value adjusted for non-looping samples + header_buffer.put_u16(sample.mix_rate) # sample_mixrate + header_buffer.put_u8(0) # TODO: attack + header_buffer.put_u8(0) # TODO: decay + header_buffer.put_u8(0) # TODO: sustain + header_buffer.put_u8(0) # TODO: release + for i in FILTER_PAD: # Prepend 3 frames of silence + payload_buffer.put_16(0) + payload_buffer.put_data(sample.data) # Copy entire S16LE audio data + if nonlooping: + for i in FILTER_PAD*2: + payload_buffer.put_16(0) # 6 frames of trailing silence to loop + else: + # Copy frame by frame in case the loop is shorter than 6 frames + var loop_length = sample.loop_end - sample.loop_begin + for i in FILTER_PAD*2: + var pos := payload_buffer.get_position() + payload_buffer.seek(pos - loop_length * 2) # Positions are in bytes, loop_length is in 16bit frames + var frame := payload_buffer.get_16() + payload_buffer.seek(pos) + payload_buffer.put_16(frame) + # Combine the unwrapped arrays + var data := header_buffer.data_array + payload_buffer.data_array # Read back from the buffers: PoolByteArray is passed by value, so header_data/payload_data are still empty + # Now calculate wrapping and rowwise padding for the combined array + for row in TEX_WIDTH: + var row_end: int = (row + 1) * TEX_WIDTH * 2 # Remember: 8bit array, 16bit values + if len(data) > row_end: # Both sides in bytes + # [... a b c] + [a b c] + [a b c ...] 
+ data = data.subarray(0, row_end-1) + data.subarray(row_end-FILTER_PAD*2, row_end-1) + data.subarray(row_end-FILTER_PAD*2, -1) + else: + break + var needed_rows := (len(data)/2)/float(TEX_WIDTH) + var rows := int(pow(2, ceil(log(needed_rows) / log(2)))) + if rows > TEX_WIDTH: + print_debug('Sound Sample Texture rows have exceeded width: %d > %d'%[rows, TEX_WIDTH]) + # Now that the full texture size is known, pad our existing data with zeroes until the end + var final_data_size_bytes = rows * TEX_WIDTH * 2 + if final_data_size_bytes > len(data): + var end_padding := PoolByteArray() + end_padding.resize(final_data_size_bytes - len(data)) + end_padding.fill(0) + data = data + end_padding + + # data is complete, turn it into an ImageTexture for the shader to use + var samples_img = Image.new() + samples_img.create_from_data(TEX_WIDTH, rows, false, Image.FORMAT_LA8, data) + self.samples_tex = ImageTexture.new() + self.samples_tex.create_from_image(samples_img, Texture.FLAG_FILTER) var player := AudioStreamPlayer.new() # Make one for each channel, later diff --git a/shaders/audio_renderer.gdshader b/shaders/audio_renderer.gdshader index 7392a4e..5007b9e 100644 --- a/shaders/audio_renderer.gdshader +++ b/shaders/audio_renderer.gdshader @@ -1,3 +1,6 @@ +// ============================================================= BOILERPLATE ============================================================= +// While most of the data we are working with is integral, GPU conversion overheads mean almost all of this will be floats. +// Unfortunately, this loses type-checking on [0.0, 1.0] vs [0,255] etc. so a lot of this will involve comments declaring ranges. 
shader_type canvas_item; render_mode blend_premul_alpha; const float TEX_SIZE = 4096.0; @@ -13,11 +16,16 @@ const float x10000 = float(0x10000); // 65536.0 const vec2 INT16_DOT_BE = vec2(xFF00, x00FF); const vec2 INT16_DOT_LE = vec2(x00FF, xFF00); + uniform sampler2D tex : hint_normal; +float unpack_uint16(vec2 uint16) { + // Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats, to the original int value [0, 65535] in float32 + return dot(uint16, INT16_DOT_LE); +} + float unpack_int16(vec2 int16) { - // Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats, - // to the original int value [-32768, 32767] or [0, 65535] but in float32 + // Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats, to the original int value [-32768, 32767] in float32 float unsigned = dot(int16, INT16_DOT_LE); return unsigned - (unsigned < x7FFF ? 0.0 : x10000); } @@ -48,6 +56,76 @@ vec4 test_writeback(vec2 uv) { return output; } + +// ============================================================= LOGIC ============================================================= +// We have around 200k frames across 35 instrument samples +// 35 instrument samples and 8 sfx samples = 43 samples +// 2048x128 texture maybe? at 2bytes per texel, that's 512KiB of VRAM +// We start the texture with a bunch of same-size headers +// uint16 sample_start // The true start, after the prepended 3 frames of silence +// uint16 sample_length // 3 frames after the true end, because of how we loop +// uint16 sample_loop_begin // 3 frames after the true loop point +// uint16 mixrate +// 2*uint8 AD of ADSR ([0.0, 1.0] is fine) +// 2*uint8 SR of ADSR ([0.0, 1.0] is fine) +// So six texture() calls spent on header information, and one on the final lookup. +// Alternatively, sample length could be omitted and fetched as the start of the next entry to save redundant entries. 
+// +// To accommodate filtering, every sample must begin with 3 frames of silence, and end with 6 frames of the beginning of the loop. +// Looped playback will go from the first 3 of 6 frames at the end, to the third frame after the loop start point, to avoid filter bleeding. +// If a sample does not loop, it must have 6 frames of silence at the end, not including the subsequent next sample's 3 frames of silence prefix. +// As such, every sample will have an additional 9 frames, 3 before, 6 after. +// Additionally, every row of the texture must have 3 redundant frames on either side - i.e., we only sample from [3, 2045) on any given row. +// So the payload of a 2048-wide texture will be 2042 per row, excluding the initial header. +// So for 43 samples, a header of 43*6 = 258 texels starts the first row, +// after which the first sample's 3 frames of silence (3 texels of (0.0, 0.0), 6 bytes of 0x00) may begin. +// A 2048x128 texture would have a payload of 2042x128 = 261376 frames (texels) excluding header +// With the 258 texel header, which uses 3 texels of margin, 255 would be subtracted from the above payload, +// leaving 261121 texels for the sample data. + +const float HEADER_LENGTH_TEXELS = 6.0; +uniform sampler2D instrument_samples; +uniform vec2 instrument_samples_size = vec2(2048.0, 128.0); +uniform float instrument_row_padding = 3.0; // In case we want to go to cubic filtering +uniform float instrument_row_payload = 2042.0; // 2048-3-3 Make sure to set with instrument_samples_size and instrument_row_padding! 
+uniform float reference_note = 71.0; // [0, 255], possibly [0, 127] +uniform float output_mixrate = 32000.0; // SNES SPC output is 32kHz + +float get_pitch_scale(float note) { + // return pow(2.0, (note - reference_note)/12.0); + return exp2((note - reference_note)/12.0); +} + +vec2 get_inst_texel(vec2 xy) { + return texture(instrument_samples, xy/instrument_samples_size).xw; +} + +float get_instrument_sample(float instrument_index, float pitch_scale, float t, float t_end) { + // t_end is for ADSR purposes + float header_offset = instrument_index * HEADER_LENGTH_TEXELS; + float sample_start = unpack_uint16(get_inst_texel(vec2(header_offset, 0.0))); // The true start, after the prepended 3 frames of silence + float sample_length = unpack_uint16(get_inst_texel(vec2(header_offset + 1.0, 0.0))); // 3 frames after the true end, because of how we loop + float sample_loop_begin = unpack_uint16(get_inst_texel(vec2(header_offset + 2.0, 0.0))); // 3 frames after the true loop point + float sample_mixrate = unpack_uint16(get_inst_texel(vec2(header_offset + 3.0, 0.0))); + vec2 attack_decay = get_inst_texel(vec2(header_offset + 4.0, 0.0)); + vec2 sustain_release = get_inst_texel(vec2(header_offset + 5.0, 0.0)); + // Calculate the point we want to sample in linear space + float mixrate = sample_mixrate * pitch_scale; + float target_frame = t * mixrate; + // If we're past the end of the sample, we need to wrap it back to within the loop range + float loop_length = sample_length - sample_loop_begin; + float overshoot = max(target_frame - sample_length, 0.0); + float overshoot_loops = ceil(overshoot/loop_length); + target_frame -= overshoot_loops*loop_length; + // Now we need to identify the sampling point since our frames are spread across multiple rows for GPU reasons + // We only sample from texel 4 onwards on a given row - texel 0 is the header, texels 1,2,3 are lead-in for filtering + // Note that y should be integral, but x should be continuous, as that's what applies the 
filtering! + target_frame += sample_start; + vec2 sample_xy = vec2(instrument_row_padding + mod(target_frame, instrument_row_payload), trunc(target_frame/instrument_row_payload)); + return rescale_int16(unpack_int16(get_inst_texel(sample_xy))); +} + + void fragment() { // GLES2 vec2 uv = vec2(UV.x, 1.0-UV.y); diff --git a/test/audio_system.gd b/test/audio_system.gd index 0a08d05..3841bbf 100644 --- a/test/audio_system.gd +++ b/test/audio_system.gd @@ -124,4 +124,23 @@ func _ready() -> void: $btn_hack_loop_extension.text += ' (%dms)'%SoundLoader.HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS for i in len(RomLoader.snes_data.bgm_song_pointers): var pointer = RomLoader.snes_data.bgm_song_pointers[i] - print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer]) + # print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer]) + +# var test_payload := PoolByteArray() +# test_payload.resize(4096*4096*2) +# # for i in 5: +# # test_payload.fill(i*2+10) +# # $'%audio_renderer'.render_queue.append(test_payload) +# test_payload.fill(0) +# for i in 65536: +# test_payload.set(i*2, i%256) +# test_payload.set(i*2+1, i/256) +# $'%audio_renderer'.render_queue.append(test_payload) +# # $'%audio_renderer'.render_queue.append(test_payload) + +# func _process(_delta): +# update() + +# func _draw() -> void: +# if $'%audio_renderer'.waiting_for_viewport: +# $'%audio_renderer'.get_result()