diff --git a/scripts/loaders/SoundLoader.gd b/scripts/loaders/SoundLoader.gd
index 1bb5376..9bb39db 100644
--- a/scripts/loaders/SoundLoader.gd
+++ b/scripts/loaders/SoundLoader.gd
@@ -18,16 +18,25 @@ const BYTES_PER_SAMPLE := 2  # 16bit samples
 # !!! Adding a few ms to the loops removes harshness.                                                 !!!
 const HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS := 2                                                         # !!!
 func HACK_EXTEND_LOOP_SAMPLE(audio: AudioStreamSample) -> AudioStreamSample:                        # !!!
-	if audio.loop_begin >= audio.loop_end:                                                          # !!!
-		return audio                                                                                # !!!
+	var output: AudioStreamSample = audio.duplicate(true)                                           # !!!
+	# Prepend silence                                                                               # !!!
+	var silent_samples := (audio.mix_rate * PREPEND_MS) / 1000                                      # !!!
+	var silence := PoolByteArray()                                                                  # !!!
+	silence.resize(silent_samples * 2)  # 16bit samples in 8bit array                               # !!!
+	silence.fill(0)                                                                                 # !!!
+	output.data = silence + output.data                                                             # !!!
+	output.loop_begin += silent_samples                                                             # !!!
+	output.loop_end += silent_samples                                                               # !!!
+	# Append looped samples                                                                         # !!!
+	if output.loop_begin >= output.loop_end:                                                        # !!!
+		return output                                                                               # !!!
 	var looped_samples = audio.data.subarray(audio.loop_begin * BYTES_PER_SAMPLE, -1)               # !!!
 	var loop_len = len(looped_samples)                                                              # !!!
 	var target_len = (audio.mix_rate * HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS / 1000) * BYTES_PER_SAMPLE  # !!!
 	while loop_len < target_len:  # Keep doubling in length until it's long enough                    !!!
 		looped_samples += looped_samples                                                            # !!!
 		loop_len = len(looped_samples)                                                              # !!!
-	var output = audio.duplicate(true)                                                              # !!!
-	output.data = audio.data + looped_samples                                                       # !!!
+	output.data += looped_samples                                                                   # !!!
 	return output                                                                                   # !!!
 # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
@@ -81,14 +90,14 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio
 		return audio
 	var num_packets := size/9
 
-	var samples = PoolIntArray([0, 0])  # Start with two zero samples for filter purposes, strip them from the actual output
+	var samples = PoolIntArray([0, 0])  # Start with two zero samples for filter purposes, strip them from the actual output later
 	var i := 2
 	for pkt in num_packets:
 		# Decode a single 9byte BRR packet
 		var header_byte := buffer.get_u8()
 		var exponent := header_byte >> 4
 		var filter := (header_byte >> 2) & 0x03
-		var loop := bool(header_byte & 0x02)
+		# var loop := bool(header_byte & 0x02)
 		var end := bool(header_byte & 0x01)
 		for sample in 8:
 			var b := buffer.get_u8()
@@ -109,30 +118,26 @@ func make_sample(buffer: StreamPeerBuffer, size: int, sample_rate: int) -> Audio
 		if end:
 			# print('End flag on packet')
 			break
-	# Convert int array to byte array
-	var audio_data = PoolByteArray()
-	# Prepend silence, accounting for the two null samples
-	var silent_samples := ((sample_rate * PREPEND_MS) / 1000) - 2
-	audio_data.resize(silent_samples * 2)  # 16bit samples in 8bit array
-	audio_data.fill(0)
+	# Remove first two zero samples
+	samples.remove(0)
+	samples.remove(0)
 	# Pack 16bit samples to 8bit array
-	for b in samples:
-		audio_data.append(b & 0xFF)
-		audio_data.append(b >> 8)
-	audio.data = audio_data
+	var out_buff = StreamPeerBuffer.new()
+	for sample in samples:
+		out_buff.put_16(sample)
+	audio.data = out_buff.data_array
 	return audio
 
 func get_inst_sample_data(snes_data: Dictionary, buffer: StreamPeerBuffer, id: int) -> AudioStreamSample:
 	var sample_rate := get_reference_pitch_samplerate(snes_data.bgm_instrument_samplerates[id] & 0xFF)
-	var silent_samples := ((sample_rate * PREPEND_MS) / 1000)
 	var loop_start_packet: int = snes_data.bgm_instrument_loop_starts[id]/9  # Note that Instrument $1F Steel Guitar has a length of $088B but a loop point of $088D which is 243.22... packets. Luckily it doesn't matter.
 	buffer.seek(snes_data.bgm_instrument_brr_pointers[id] & 0x3FFFFF)
 	var size := buffer.get_u16()
 	var num_samples := (size/9)*16
 	var audio := make_sample(buffer, size, sample_rate)
 	audio.loop_mode = AudioStreamSample.LOOP_FORWARD
-	audio.loop_begin = (loop_start_packet * 16) + silent_samples  # Each 9byte packet is 16 samples
-	audio.loop_end = silent_samples + num_samples
+	audio.loop_begin = (loop_start_packet * 16)  # Each 9byte packet is 16 samples
+	audio.loop_end = num_samples
 	# print_debug('Loaded instrument #%02X with lookup offset $%06X, BRR data offset $%06X, length $%04X (%f packets, %d samples) and loop point %d samples' % [id, lookup_offset, brr_offset, size, size/9.0, num_samples, audio.loop_begin])
 	return audio
 
@@ -148,11 +153,10 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer):
 		buffer.seek(brr_spc_addrs[i] + brr_spc_start)
 		# print('Loading sfx sample #%X with BRR data offset $%06X' % [i, buffer.get_position()])
 		var sample_rate := get_reference_pitch_samplerate(snes_data.sfx_samplerates[i] & 0xFF)
-		var silent_samples := ((sample_rate * PREPEND_MS) / 1000)
 		var audio := make_sample(buffer, 900, sample_rate)
 		var loop_start_packet: int = brr_spc_loop_addrs[i] - brr_spc_addrs[i]
 		audio.loop_mode = AudioStreamSample.LOOP_FORWARD
-		audio.loop_begin = (loop_start_packet * 16) + silent_samples  # Each 9byte packet is 16 samples
+		audio.loop_begin = loop_start_packet * 16  # Each 9byte packet is 16 samples
 		audio.loop_end = (len(audio.data)/2)
 		sfx_samples.append(audio)  # Use 900 as a limit, it won't be hit, parser stops after End packet anyway
 		emit_signal('audio_sfx_sample_loaded', i)
@@ -162,13 +166,103 @@ func load_sfx_samples_data(snes_data: Dictionary, buffer: StreamPeerBuffer):
 # Called when the node enters the scene tree for the first time.
 func load_samples(snes_data: Dictionary, buffer: StreamPeerBuffer):
 	load_sfx_samples_data(snes_data, buffer)
+	var largest_sample_idx := -1  # Debug statistic: index of the instrument with the largest loop_end seen so far
+	var largest_sample_sample_count := 0  # Debug statistic: that instrument's loop_end
+	var total_frames := 0  # Debug statistic: sum of loop_end over all instruments
 	# For some reason, this is a bit slow currently under certain editor conditions. Might optimize later.
 	for i in INST_NUM:
 		instrument_samples.append(get_inst_sample_data(snes_data, buffer, i))
+		total_frames += instrument_samples[i].loop_end  # The statistics are only read by the commented-out prints below
+		if largest_sample_sample_count < instrument_samples[i].loop_end:
+			largest_sample_sample_count = instrument_samples[i].loop_end
+			largest_sample_idx = i
 		# Workaround for Godot 3.x quirk where looping samples are interpolated as if they go to nothing instead of looping
 		instrument_samples_HACK_EXTENDED_LOOPS.append(HACK_EXTEND_LOOP_SAMPLE(instrument_samples[i]))
-		print('Instrument %02X has mix_rate %d Hz'%[i, instrument_samples[i].mix_rate])
+		# print('Instrument %02X has mix_rate %d Hz and %d samples'%[i, instrument_samples[i].mix_rate, len(instrument_samples[i].data)/2])
 		emit_signal('audio_inst_sample_loaded', i)
+	# print('Largest sample is instrument %d with length %d and mix_rate %d'%[largest_sample_idx, largest_sample_sample_count, instrument_samples[largest_sample_idx].mix_rate])
+	# print('Total frames: %d'%total_frames)
+
+
+# We start the texture with a bunch of same-size headers
+#     uint16 sample_start       // The true start, after the prepended 3 frames of silence
+#     uint16 sample_length      // 3 frames after the true end, because of how we loop
+#     uint16 sample_loop_begin  // 3 frames after the true loop point
+#     uint16 mixrate
+#     2*uint8 AD of ADSR ([0.0, 1.0] is fine)
+#     2*uint8 SR of ADSR ([0.0, 1.0] is fine)
+var samples_tex: ImageTexture
+const TEX_WIDTH := 2048
+const FILTER_PAD := 3
+func samples_to_texture():
+	# Packs all instrument and sfx samples into self.samples_tex as FORMAT_LA8, i.e. one 16bit value per texel:
+	# fixed-size per-sample headers first, then every sample's padded S16LE frames.
+	var num_samples := INST_NUM + SFX_NUM
+	var header_length := num_samples * 6  # Six 2-byte texels of header per sample
+
+	# Create header and unwrapped payload separately first.
+	# PoolByteArrays are copied on assignment, so the written bytes are read back from the buffers themselves.
+	var header_buffer := StreamPeerBuffer.new()
+	var payload_buffer := StreamPeerBuffer.new()
+
+	for sample in instrument_samples + sfx_samples:
+		var loop_end: int = sample.loop_end
+		var loop_begin: int = sample.loop_begin
+		var nonlooping: bool = loop_begin >= loop_end
+		if nonlooping:  # Loop over a single frame of the trailing silence instead
+			loop_begin = loop_end
+			loop_end += 1
+		# NOTE(review): these are uint16 fields, so sample_start wraps once the payload passes 65535 frames - confirm
+		header_buffer.put_u16(header_length + (payload_buffer.get_position()/2) + FILTER_PAD)  # sample_start
+		header_buffer.put_u16(loop_end + FILTER_PAD)  # sample_length
+		header_buffer.put_u16(loop_begin + FILTER_PAD)  # sample_loop_begin
+		header_buffer.put_u16(sample.mix_rate)  # sample_mixrate
+		header_buffer.put_u8(0)  # TODO: attack
+		header_buffer.put_u8(0)  # TODO: decay
+		header_buffer.put_u8(0)  # TODO: sustain
+		header_buffer.put_u8(0)  # TODO: release
+		for i in FILTER_PAD:  # Prepend 3 frames of silence
+			payload_buffer.put_16(0)
+		payload_buffer.put_data(sample.data)  # Copy entire S16LE audio data
+		if nonlooping:
+			for i in FILTER_PAD*2:
+				payload_buffer.put_16(0)  # 6 frames of trailing silence to loop
+		else:
+			# Copy frame by frame in case the loop is shorter than 6 frames
+			var loop_length_bytes: int = (loop_end - loop_begin) * BYTES_PER_SAMPLE  # seek() works in bytes
+			for i in FILTER_PAD*2:
+				var pos := payload_buffer.get_position()
+				payload_buffer.seek(pos - loop_length_bytes)
+				var frame := payload_buffer.get_16()
+				payload_buffer.seek(pos)
+				payload_buffer.put_16(frame)
+	# Combine the unwrapped arrays
+	var data: PoolByteArray = header_buffer.data_array + payload_buffer.data_array
+	# Now calculate wrapping and rowwise padding for the combined array
+	for row in TEX_WIDTH:
+		var row_end: int = (row + 1) * TEX_WIDTH * 2  # Byte offset - remember: 8bit array, 16bit values
+		if len(data) > row_end:
+			# [... a b c] + [a b c] + [a b c ...]
+			data = data.subarray(0, row_end-1) + data.subarray(row_end-FILTER_PAD*2, row_end-1) + data.subarray(row_end-FILTER_PAD*2, -1)
+		else:
+			break
+	var needed_rows := (len(data)/2)/float(TEX_WIDTH)
+	var rows := int(max(1.0, pow(2, ceil(log(needed_rows) / log(2)))))  # Next power of two, at least one row
+	if rows > TEX_WIDTH:
+		print_debug('Sound Sample Texture rows have exceeded width: %d > %d'%[rows, TEX_WIDTH])
+	# Now that the full texture size is known, pad our existing data with zeroes until the end
+	var final_data_size_bytes = rows * TEX_WIDTH * 2
+	if final_data_size_bytes > len(data):
+		var end_padding := PoolByteArray()
+		end_padding.resize(final_data_size_bytes - len(data))
+		end_padding.fill(0)
+		data = data + end_padding
+
+	# data is complete, turn it into an ImageTexture for the shader to use
+	var samples_img = Image.new()
+	samples_img.create_from_data(TEX_WIDTH, rows, false, Image.FORMAT_LA8, data)
+	self.samples_tex = ImageTexture.new()
+	self.samples_tex.create_from_image(samples_img, Texture.FLAG_FILTER)
 
 
 var player := AudioStreamPlayer.new()  # Make one for each channel, later
diff --git a/shaders/audio_renderer.gdshader b/shaders/audio_renderer.gdshader
index 7392a4e..5007b9e 100644
--- a/shaders/audio_renderer.gdshader
+++ b/shaders/audio_renderer.gdshader
@@ -1,3 +1,6 @@
+// ============================================================= BOILERPLATE =============================================================
+// While most of the data we are working with is integral, GPU conversion overheads mean almost all of this will be floats.
+// Unfortunately, this loses type-checking on [0.0, 1.0] vs [0,255] etc. so a lot of this will involve comments declaring ranges.
 shader_type canvas_item;
 render_mode blend_premul_alpha;
 const float TEX_SIZE = 4096.0;
@@ -13,11 +16,16 @@ const float x10000 = float(0x10000);  // 65536.0
 
 const vec2 INT16_DOT_BE = vec2(xFF00, x00FF);
 const vec2 INT16_DOT_LE = vec2(x00FF, xFF00);
+
 uniform sampler2D tex : hint_normal;
 
+float unpack_uint16(vec2 uint16) {
+	// Convert a packed little-endian 2byte integer (x = low byte, y = high byte), sampled as two [0.0, 1.0] range floats, to the original int value [0, 65535] in float32
+	return dot(uint16, INT16_DOT_LE);
+}
+
 float unpack_int16(vec2 int16) {
-	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats,
-	// to the original int value [-32768, 32767] or [0, 65535] but in float32
+	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats, to the original int value [-32768, 32767] in float32
 	float unsigned = dot(int16, INT16_DOT_LE);
-	return unsigned - (unsigned < x7FFF ? 0.0 : x10000);
+	return unsigned - (unsigned < (x7FFF + 0.5) ? 0.0 : x10000);  // A value equal to x7FFF is the largest positive int16 and must not wrap; the half step absorbs float rounding
 }
@@ -48,6 +56,76 @@ vec4 test_writeback(vec2 uv) {
 	return output;
 }
 
+
+// =============================================================    LOGIC    =============================================================
+// We have around 200k frames across 35 instrument samples
+// 35 instrument samples and 8 sfx samples = 43 samples
+// 2048x128 texture maybe? at 2bytes per texel, that's 512KiB of VRAM
+// We start the texture with a bunch of same-size headers
+//     uint16 sample_start       // The true start, after the prepended 3 frames of silence
+//     uint16 sample_length      // 3 frames after the true end, because of how we loop
+//     uint16 sample_loop_begin  // 3 frames after the true loop point
+//     uint16 mixrate
+//     2*uint8 AD of ADSR ([0.0, 1.0] is fine)
+//     2*uint8 SR of ADSR ([0.0, 1.0] is fine)
+// So six texture() calls spent on header information, and one on the final lookup.
+// Alternatively, sample length could be omitted and fetched as the start of the next entry to save redundant entries.
+//
+// To accommodate filtering, every sample must begin with 3 frames of silence, and end with 6 frames of the beginning of the loop.
+// Looped playback will go from the first 3 of 6 frames at the end, to the third frame after the loop start point, to avoid filter bleeding.
+// If a sample does not loop, it must have 6 frames of silence at the end, not including the subsequent next sample's 3 frames of silence prefix.
+// As such, every sample will have an additional 9 frames, 3 before, 6 after.
+// Additionally, every row of the texture must have 3 redundant frames on either side - i.e., we only sample from [3, 2045) on any given row.
+// So the payload of a 2048-wide texture will be 2042 per row, excluding the initial header.
+// So for 43 samples, a header of 43*6 = 258 texels starts the first row,
+//   after which the first sample's 3 frames of silence (3 texels of (0.0, 0.0), 6 bytes of 0x00) may begin.
+// A 2048x128 texture would have a payload of 2042x128 = 261376 frames (texels) excluding header
+// With the 258 texel header, which uses 3 texels of margin, 255 would be subtracted from the above payload,
+//   leaving 261121 texels for the sample data.
+
+const float HEADER_LENGTH_TEXELS = 6.0;
+uniform sampler2D instrument_samples;
+uniform vec2 instrument_samples_size = vec2(2048.0, 128.0);
+uniform float instrument_row_padding = 3.0;  // In case we want to go to cubic filtering
+uniform float instrument_row_payload = 2042.0;  // 2048-3-3 Make sure to set with instrument_samples_size and instrument_row_padding!
+uniform float reference_note = 71.0;  // [0, 255], possibly [0, 127]
+uniform float output_mixrate = 32000.0;  // SNES SPC output is 32kHz
+
+float get_pitch_scale(float note) {  // Playback-rate multiplier for note relative to reference_note: every 12 notes above it doubles the rate
+	// return pow(2.0, (note - reference_note)/12.0);
+	return exp2((note - reference_note)/12.0);
+}
+
+vec2 get_inst_texel(vec2 xy) {  // xy is in texel units (x = column, y = row) of instrument_samples; returns the texel's two bytes as [0.0, 1.0] floats
+	return texture(instrument_samples, (xy + 0.5)/instrument_samples_size).xw;  // +0.5 targets texel centres: integral xy reads exactly one texel instead of blending with its neighbours
+}
+
+float get_instrument_sample(float instrument_index, float pitch_scale, float t, float t_end) {
+	// Returns the instrument's frame at time t (t times the pitch-scaled mixrate gives the frame index), wrapping into the loop past the end. t_end is for ADSR purposes (unused so far)
+	float header_offset = instrument_index * HEADER_LENGTH_TEXELS;
+	float sample_start = unpack_uint16(get_inst_texel(vec2(header_offset, 0.0)));  // The true start, after the prepended 3 frames of silence
+	float sample_length = unpack_uint16(get_inst_texel(vec2(header_offset + 1.0, 0.0)));  // 3 frames after the true end, because of how we loop
+	float sample_loop_begin = unpack_uint16(get_inst_texel(vec2(header_offset + 2.0, 0.0)));  // 3 frames after the true loop point
+	float sample_mixrate = unpack_uint16(get_inst_texel(vec2(header_offset + 3.0, 0.0)));
+	vec2 attack_decay = get_inst_texel(vec2(header_offset + 4.0, 0.0));
+	vec2 sustain_release = get_inst_texel(vec2(header_offset + 5.0, 0.0));
+	// Calculate the point we want to sample in linear space
+	float mixrate = sample_mixrate * pitch_scale;
+	float target_frame = t * mixrate;
+	// If we're past the end of the sample, we need to wrap it back to within the loop range
+	float loop_length = max(sample_length - sample_loop_begin, 1.0);  // Never divide by a zero/negative loop length
+	float overshoot = max(target_frame - sample_length, 0.0);
+	float overshoot_loops = ceil(overshoot/loop_length);
+	target_frame -= overshoot_loops*loop_length;
+	// Now we need to identify the sampling point since our frames are spread across multiple rows for GPU reasons
+	// We only sample from texel instrument_row_padding onwards on a given row - the texels before it are lead-in for filtering
+	// Note that y should be integral, but x should be continuous, as that's what applies the filtering!
+	target_frame += sample_start;
+	vec2 sample_xy = vec2(instrument_row_padding + mod(target_frame, instrument_row_payload), trunc(target_frame/instrument_row_payload));
+	return rescale_int16(unpack_int16(get_inst_texel(sample_xy)));
+}
+
+
 void fragment() {
 	// GLES2
 	vec2 uv = vec2(UV.x, 1.0-UV.y);
diff --git a/test/audio_system.gd b/test/audio_system.gd
index 0a08d05..3841bbf 100644
--- a/test/audio_system.gd
+++ b/test/audio_system.gd
@@ -124,4 +124,23 @@ func _ready() -> void:
 	$btn_hack_loop_extension.text += ' (%dms)'%SoundLoader.HACK_EXTEND_LOOP_SAMPLE_EXTRA_MS
 	for i in len(RomLoader.snes_data.bgm_song_pointers):
 		var pointer = RomLoader.snes_data.bgm_song_pointers[i]
-		print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer])
+		# print('BGM 0x%02X (%02d) at 0x%06X' % [i, i, pointer])
+
+# 	var test_payload := PoolByteArray()
+# 	test_payload.resize(4096*4096*2)
+# 	# for i in 5:
+# 	# 	test_payload.fill(i*2+10)
+# 	# 	$'%audio_renderer'.render_queue.append(test_payload)
+# 	test_payload.fill(0)
+# 	for i in 65536:
+# 		test_payload.set(i*2, i%256)
+# 		test_payload.set(i*2+1, i/256)
+# 	$'%audio_renderer'.render_queue.append(test_payload)
+# 	# $'%audio_renderer'.render_queue.append(test_payload)
+
+# func _process(_delta):
+# 	update()
+
+# func _draw() -> void:
+# 	if $'%audio_renderer'.waiting_for_viewport:
+# 		$'%audio_renderer'.get_result()