// ============================================================= BOILERPLATE =============================================================

// While most of the data we are working with is integral, GPU conversion overheads mean almost all of this will be floats.
// Unfortunately, this loses type-checking on [0.0, 1.0] vs [0,255] etc. so a lot of this will involve comments declaring ranges.

shader_type canvas_item;
render_mode blend_premul_alpha;

const float TEX_SIZE = 4096.0;
const float UV_QUANTIZE = TEX_SIZE;

// I feel like these magic numbers are a bit more intuitive in hex
const float x00FF = float(0x00FF); // 255.0
const float x0100 = float(0x0100); // 256.0
const float x7FFF = float(0x7FFF); // 32767.0
const float x8000 = float(0x8000); // 32768.0
const float xFF00 = float(0xFF00); // 65280.0
const float xFFFF = float(0xFFFF); // 65535.0
const float x10000 = float(0x10000); // 65536.0

// Dot-product weights that turn two [0.0, 1.0] byte samples into a 16-bit value.
// 0x00FF scales a sample back to its byte [0, 255]; 0xFF00 scales it to byte*256.
const vec2 INT16_DOT_BE = vec2(xFF00, x00FF); // (MSB, LSB) order
const vec2 INT16_DOT_LE = vec2(x00FF, xFF00); // (LSB, MSB) order

uniform sampler2D tex : hint_normal;

float unpack_uint16(vec2 uint16) {
	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats (LSB first),
	// to the original int value [0, 65535] in float32
	return dot(uint16, INT16_DOT_LE);
}

float unpack_int16(vec2 int16) {
	// Convert packed 2byte integer, sampled as two [0.0, 1.0] range floats (LSB first),
	// to the original int value [-32768, 32767] in float32
	float raw = dot(int16, INT16_DOT_LE); // [0, 65535]
	// Two's complement: only values with the top bit set (>= 0x8000) are negative.
	// The comparison must be against 0x8000, not 0x7FFF, otherwise 32767 itself
	// would be wrapped down to -32769.
	return raw - (raw < x8000 ? 0.0 : x10000);
}

float rescale_int16(float int16) {
	// Rescale from [-32768, 32767] to [-1.0, 1.0)
	return int16 / x8000;
}

vec2 pack_float_to_int16(float value) {
	// Convert a float in range [-1.0, 1.0) to a signed 2byte integer [-32768, 32767]
	// packed into two [0.0, 1.0] floats, returned as (LSB, MSB)
	float scaled = value * x8000;
	// Map negatives onto their two's complement representation [32768, 65535]
	float wrapped = scaled + (scaled < 0.0 ? x10000 : 0.0);
	float wrapped_div_256 = wrapped / x0100;
	// Integer part is the high byte, fractional part (times 256) is the low byte.
	// NOTE: `scaled` is not rounded here, so a non-integral input leaves a fractional
	// remainder in the low byte.
	float MSB = trunc(wrapped_div_256) / x00FF;
	float LSB = fract(wrapped_div_256) * x0100 / x00FF;
	return vec2(LSB, MSB);
}

vec4 test_writeback(vec2 uv) {
	// Test importing and exporting the samples,
	// and exporting a value derived from the UV
	// xy: the sample read from the x and w channels of `tex`, decoded then re-encoded
	// zw: the linear texel index (x + y*TEX_SIZE) of the quantised UV, encoded the same way
	vec4 packed;
	float sample_1 = rescale_int16(unpack_int16(texture(tex, uv).xw));
	float sample_2 = rescale_int16(dot(trunc(uv*TEX_SIZE), vec2(1.0, TEX_SIZE)));
	packed.xy = pack_float_to_int16(sample_1);
	packed.zw = pack_float_to_int16(sample_2);
	return packed;
}

// ============================================================= LOGIC =============================================================

// We have around 200k frames across 35 instrument samples
// 35 instrument samples and 8 sfx samples = 43 samples
// 2048x128 texture maybe? at 2bytes per texel, that's 512KiB of VRAM
// We start the texture with a bunch of same-size headers
//   uint16 sample_start       // The true start, after the prepended 3 frames of silence
//   uint16 sample_length      // 3 frames after the true end, because of how we loop
//   uint16 sample_loop_begin  // 3 frames after the true loop point
//   uint16 mixrate
//   2*uint8 AD of ADSR ([0.0, 1.0] is fine)
//   2*uint8 SR of ADSR ([0.0, 1.0] is fine)
// So six texture() calls spent on header information, and one on the final lookup.
// Alternatively, sample length could be omitted and fetched as the start of the next entry to save redundant entries.
//
// To accommodate filtering, every sample must begin with 3 frames of silence, and end with 6 frames of the beginning of the loop.
// Looped playback will go from the first 3 of 6 frames at the end, to the third frame after the loop start point, to avoid filter bleeding.
// If a sample does not loop, it must have 6 frames of silence at the end, not including the subsequent next sample's 3 frames of silence prefix.
// As such, every sample will have an additional 9 frames, 3 before, 6 after.
// Additionally, every row of the texture must have 3 redundant frames on either side - i.e., we only sample from [3, 2045) on any given row.
// So the payload of a 2048-wide texture will be 2042 per row, excluding the initial header.
// So for 43 samples, a header of 43*6 = 258 texels starts the first row,
// after which the first sample's 3 frames of silence (3 texels of (0.0, 0.0), 6 bytes of 0x00) may begin.
// A 2048x128 texture would have a payload of 2042x128 = 261376 frames (texels) excluding header
// With the 258 texel header, which uses 3 texels of margin, 255 would be subtracted from the above payload,
// leaving 261121 texels for the sample data.

// Number of header texels stored per sample
// (start, length, loop begin, mixrate, attack/decay, sustain/release)
const float HEADER_LENGTH_TEXELS = 6.0;

// Packed sample data texture, with the per-sample headers at the start of row 0
uniform sampler2D instrument_samples;
// Dimensions of instrument_samples in texels (width, height)
uniform vec2 instrument_samples_size = vec2(2048.0, 128.0);
// Redundant texels on either side of each row. In case we want to go to cubic filtering
uniform float instrument_row_padding = 3.0;
// Usable texels per row: 2048-3-3. Make sure to set with instrument_samples_size and instrument_row_padding!
uniform float instrument_row_payload = 2042.0;
uniform float reference_note = 71.0; // [0, 255], possibly [0, 127]
uniform float output_mixrate = 32000.0; // SNES SPC output is 32kHz

float get_pitch_scale(float note) {
	// Playback-rate multiplier for `note` relative to reference_note:
	// 12 notes per doubling, i.e. pow(2.0, (note - reference_note)/12.0)
	return exp2((note - reference_note)/12.0);
}

vec2 get_inst_texel(vec2 xy) {
	// Fetch the two data channels (x and w) of instrument_samples at texel coordinate `xy`.
	// The +0.5 addresses the texel *centre*: an integral coordinate then returns exactly
	// that texel instead of landing on the boundary between it and its neighbour (or the
	// neighbouring row). A fractional x still blends towards the next texel when the
	// texture is filtered, which is what the sample lookup relies on.
	return texture(instrument_samples, (xy + 0.5)/instrument_samples_size).xw;
}

float get_instrument_sample(float instrument_index, float pitch_scale, float t, float t_end) {
	// Returns the instrument's sample value in [-1.0, 1.0) at time `t`, played back at
	// the header mixrate multiplied by `pitch_scale`.
	// t_end is for ADSR purposes (not used yet)

	// Headers are HEADER_LENGTH_TEXELS consecutive texels per instrument on row 0
	float header_offset = instrument_index * HEADER_LENGTH_TEXELS;
	float sample_start = unpack_uint16(get_inst_texel(vec2(header_offset, 0.0))); // The true start, after the prepended 3 frames of silence
	float sample_length = unpack_uint16(get_inst_texel(vec2(header_offset + 1.0, 0.0))); // 3 frames after the true end, because of how we loop
	float sample_loop_begin = unpack_uint16(get_inst_texel(vec2(header_offset + 2.0, 0.0))); // 3 frames after the true loop point
	float sample_mixrate = unpack_uint16(get_inst_texel(vec2(header_offset + 3.0, 0.0)));
	// ADSR parameters are fetched but not applied yet
	vec2 attack_decay = get_inst_texel(vec2(header_offset + 4.0, 0.0));
	vec2 sustain_release = get_inst_texel(vec2(header_offset + 5.0, 0.0));

	// Calculate the point we want to sample in linear space
	float mixrate = sample_mixrate * pitch_scale;
	float target_frame = t * mixrate;

	// If we're past the end of the sample, we need to wrap it back to within the loop range
	float loop_length = sample_length - sample_loop_begin;
	float overshoot = max(target_frame - sample_length, 0.0);
	if (loop_length > 0.0) {
		float overshoot_loops = ceil(overshoot/loop_length);
		target_frame -= overshoot_loops*loop_length;
	} else {
		// Degenerate loop (loop begin at or past the end): dividing by loop_length
		// would poison target_frame, so hold at the end of the sample instead.
		target_frame -= overshoot;
	}

	// Now we need to identify the sampling point since our frames are spread across multiple rows for GPU reasons
	// Each row carries instrument_row_payload frames, preceded by instrument_row_padding texels of lead-in for filtering
	// Note that y should be integral, but x should be continuous, as that's what applies the filtering!
	target_frame += sample_start;
	vec2 sample_xy = vec2(
		instrument_row_padding + mod(target_frame, instrument_row_payload),
		trunc(target_frame/instrument_row_payload)
	);
	return rescale_int16(unpack_int16(get_inst_texel(sample_xy)));
}

void fragment() {
	// GLES2
	// Flip V, then snap the UV to the centre of its cell on the UV_QUANTIZE grid
	vec2 uv = vec2(UV.x, 1.0-UV.y);
	uv = (trunc(uv*UV_QUANTIZE)+0.5)/UV_QUANTIZE;
	COLOR.xyzw = test_writeback(uv);
}