From fb3aa9628d8fa079696c5484ac8a4ff143c4bfed Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 20 Jun 2023 01:40:35 +0300 Subject: [PATCH] rsx: Migrate vertex fetch out of the cpp file --- rpcs3/Emu/RSX/Program/GLSLCommon.cpp | 204 +---------------- .../GLSLSnippets/RSXProg/RSXVertexFetch.glsl | 209 ++++++++++++++++++ rpcs3/emucore.vcxproj | 1 + rpcs3/emucore.vcxproj.filters | 3 + 4 files changed, 216 insertions(+), 201 deletions(-) create mode 100644 rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl diff --git a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp index 6a511b56ca..369933cf2a 100644 --- a/rpcs3/Emu/RSX/Program/GLSLCommon.cpp +++ b/rpcs3/Emu/RSX/Program/GLSLCommon.cpp @@ -129,210 +129,12 @@ namespace glsl // Note: Tested on Mesa iris with HD 530 and compilant path works fine, may be a bug on Windows proprietary drivers if (!glsl4_compliant) { - OS << - "void mov(inout uvec4 vector, const in int index, const in uint scalar)\n" - "{\n" - " switch(index)\n" - " {\n" - " case 0: vector.x = scalar; return;\n" - " case 1: vector.y = scalar; return;\n" - " case 2: vector.z = scalar; return;\n" - " case 3: vector.w = scalar; return;\n" - " }\n" - "}\n\n" - - "uint ref(const in uvec4 vector, const in int index)\n" - "{\n" - " switch(index)\n" - " {\n" - " case 0: return vector.x;\n" - " case 1: return vector.y;\n" - " case 2: return vector.z;\n" - " case 3: return vector.w;\n" - " }\n" - "}\n\n"; - } - else - { - OS << - "#define mov(v, i, s) v[i] = s\n" - "#define ref(v, i) v[i]\n\n"; + OS << "#define _INTEL_GLSL\n"; } OS << - "struct attribute_desc\n" - "{\n" - " uint type;\n" - " uint attribute_size;\n" - " uint starting_offset;\n" - " uint stride;\n" - " uint frequency;\n" - " bool swap_bytes;\n" - " bool is_volatile;\n" - " bool modulo;\n" - "};\n\n" - - "uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap)\n" - "{\n" - " return (swap) ?\n" - " _set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) :\n" - " _set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8);\n" - "}\n\n" - - "uint gen_bits(const in uint x, const in uint y, const in bool swap)\n" - "{\n" - " return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8);\n" - "}\n\n" - - // NOTE: (int(n) or int(n)) is broken on some NVIDIA and INTEL hardware when the sign bit is involved. - // See https://github.com/RPCS3/rpcs3/issues/8990 - "vec4 sext(const in ivec4 bits)\n" - "{\n" - " // convert raw 16 bit values into signed 32-bit float4 counterpart\n" - " bvec4 sign_check = lessThan(bits, ivec4(0x8000));\n" - " return _select(bits - 65536, bits, sign_check);\n" - "}\n\n" - - "float sext(const in int bits)\n" - "{\n" - " return (bits < 0x8000) ? float(bits) : float(bits - 65536); \n" - "}\n\n" - - "vec4 fetch_attribute(const in attribute_desc desc, const in int vertex_id, usamplerBuffer input_stream)\n" - "{\n" - " const int elem_size_table[] = { 0, 2, 4, 2, 1, 2, 4, 1 };\n" - " const float scaling_table[] = { 1., 32767.5, 1., 1., 255., 1., 32767., 1. 
};\n" - " const int elem_size = elem_size_table[desc.type];\n" - " const vec4 scale = scaling_table[desc.type].xxxx;\n\n" - - " uvec4 tmp, result = uvec4(0u);\n" - " vec4 ret;\n" - " int n, i = int((vertex_id * desc.stride) + desc.starting_offset);\n\n" - - " for (n = 0; n < desc.attribute_size; n++)\n" - " {\n" - " tmp.x = texelFetch(input_stream, i++).x;\n" - " if (elem_size == 2)\n" - " {\n" - " tmp.y = texelFetch(input_stream, i++).x;\n" - " tmp.x = gen_bits(tmp.x, tmp.y, desc.swap_bytes);\n" - " }\n" - " else if (elem_size == 4)\n" - " {\n" - " tmp.y = texelFetch(input_stream, i++).x;\n" - " tmp.z = texelFetch(input_stream, i++).x;\n" - " tmp.w = texelFetch(input_stream, i++).x;\n" - " tmp.x = gen_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes);\n" - " }\n\n" - - " mov(result, n, tmp.x);\n" - " }\n\n" - - " // Actual decoding step is done in vector space, outside the loop\n" - " if (desc.type == VTX_FMT_SNORM16 || desc.type == VTX_FMT_SINT16)\n" - " {\n" - " ret = sext(ivec4(result));\n" - " ret = fma(vec4(0.5), vec4(desc.type == VTX_FMT_SNORM16), ret);\n" - " }\n" - " else if (desc.type == VTX_FMT_FLOAT32)\n" - " {\n" - " ret = uintBitsToFloat(result);\n" - " }\n" - " else if (desc.type == VTX_FMT_FLOAT16)\n" - " {\n" - " tmp.x = _set_bits(result.x, result.y, 16, 16);\n" - " tmp.y = _set_bits(result.z, result.w, 16, 16);\n" - " ret.xy = unpackHalf2x16(tmp.x);\n" - " ret.zw = unpackHalf2x16(tmp.y);\n" - " }\n" - " else if (elem_size == 1) //(desc.type == VTX_FMT_UINT8 || desc.type == VTX_FMT_UNORM8)\n" - " {\n" - " // Ignore bswap on single byte channels\n" - " ret = vec4(result);\n" - " }\n" - " else //if (desc.type == VTX_FMT_COMP32)\n" - " {\n" - " result = uvec4(_get_bits(result.x, 0, 11),\n" - " _get_bits(result.x, 11, 11),\n" - " _get_bits(result.x, 22, 10),\n" - " uint(scale.x));\n" - " ret = sext(ivec4(result) << ivec4(5, 5, 6, 0));\n" - " }\n\n" - - " if (desc.attribute_size < 4)\n" - " {\n" - " ret.w = scale.x;\n" - " }\n\n" - - " return ret / scale; \n" - "}\n\n" - - "attribute_desc fetch_desc(const in int location)\n" - "{\n" - " // Each descriptor is 64 bits wide\n" - " // [0-8] attribute stride\n" - " // [8-24] attribute divisor\n" - " // [24-27] attribute type\n" - " // [27-30] attribute size\n" - " // [30-31] reserved\n" - " // [32-60] starting offset\n" - " // [60-61] swap bytes flag\n" - " // [61-62] volatile flag\n" - " // [62-63] modulo enable flag\n\n"; - - if (rules == glsl_rules_opengl4) - { - // Data is packed into a ubo - OS << - " int block = (location >> 1);\n" - " int sub_block = (location & 1) << 1;\n" - " uvec2 attrib = uvec2(\n" - " ref(input_attributes_blob[block], sub_block + 0),\n" - " ref(input_attributes_blob[block], sub_block + 1));\n\n"; - } - else - { - // Fetch parameters streamed separately from draw parameters - OS << - " uvec2 attrib = texelFetch(vertex_layout_stream, location + int(layout_ptr_offset)).xy;\n\n"; - } - - OS << - " attribute_desc result;\n" - " result.stride = _get_bits(attrib.x, 0, 8);\n" - " result.frequency = _get_bits(attrib.x, 8, 16);\n" - " result.type = _get_bits(attrib.x, 24, 3);\n" - " result.attribute_size = _get_bits(attrib.x, 27, 3);\n" - " result.starting_offset = _get_bits(attrib.y, 0, 29);\n" - " result.swap_bytes = _test_bit(attrib.y, 29);\n" - " result.is_volatile = _test_bit(attrib.y, 30);\n" - " result.modulo = _test_bit(attrib.y, 31);\n" - " return result;\n" - "}\n\n" - - "vec4 read_location(const in int location)\n" - "{\n" - " attribute_desc desc = fetch_desc(location);\n" - " int vertex_id = " << 
vertex_id_name << " - int(vertex_base_index);\n" - " if (desc.frequency == 0)\n" - " {\n" - " vertex_id = 0;\n" - " }\n" - " else if (desc.modulo)\n" - " {\n" - " //if a vertex modifier is active; vertex_base must be 0 and is ignored\n" - " vertex_id = (" << vertex_id_name << " + int(vertex_index_offset)) % int(desc.frequency);\n" - " }\n" - " else\n" - " {\n" - " vertex_id /= int(desc.frequency); \n" - " }\n\n" - - " if (desc.is_volatile)\n" - " return fetch_attribute(desc, vertex_id, volatile_input_stream);\n" - " else\n" - " return fetch_attribute(desc, vertex_id, persistent_input_stream);\n" - "}\n\n"; + #include "GLSLSnippets/RSXProg/RSXVertexFetch.glsl" + ; } void insert_rop_init(std::ostream& OS) diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl new file mode 100644 index 0000000000..c0be5e8786 --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/RSXProg/RSXVertexFetch.glsl @@ -0,0 +1,209 @@ +R"( +#ifdef _INTEL_GLSL +// For intel GPUs which cannot access vectors in indexed mode (driver bug? or glsl version too low?) +// Note: Tested on Mesa iris with HD 530 and compilant path works fine, may be a bug on Windows proprietary drivers +void mov(inout uvec4 vector, const in int index, const in uint scalar) +{ + switch(index) + { + case 0: vector.x = scalar; return; + case 1: vector.y = scalar; return; + case 2: vector.z = scalar; return; + case 3: vector.w = scalar; return; + } +} + +uint ref(const in uvec4 vector, const in int index) +{ + switch(index) + { + case 0: return vector.x; + case 1: return vector.y; + case 2: return vector.z; + case 3: return vector.w; + } +} +#else +#define mov(v, i, s) v[i] = s +#define ref(v, i) v[i] +#endif + +#ifdef VULKAN +#define _gl_VertexID gl_VertexIndex +#else +#define _gl_VertexID gl_VertexID +#endif + +struct attribute_desc +{ + uint type; + uint attribute_size; + uint starting_offset; + uint stride; + uint frequency; + bool swap_bytes; + bool is_volatile; + bool modulo; +}; + +uint gen_bits(const in uint x, const in uint y, const in uint z, const in uint w, const in bool swap) +{ + return (swap) ? + _set_bits(_set_bits(_set_bits(w, z, 8, 8), y, 16, 8), x, 24, 8) : + _set_bits(_set_bits(_set_bits(x, y, 8, 8), z, 16, 8), w, 24, 8); +} + +uint gen_bits(const in uint x, const in uint y, const in bool swap) +{ + return (swap)? _set_bits(y, x, 8, 8) : _set_bits(x, y, 8, 8); +} + +// NOTE: (int(n) or int(n)) is broken on some NVIDIA and INTEL hardware when the sign bit is involved. +// See https://github.com/RPCS3/rpcs3/issues/8990 +vec4 sext(const in ivec4 bits) +{ + // convert raw 16 bit values into signed 32-bit float4 counterpart + bvec4 sign_check = lessThan(bits, ivec4(0x8000)); + return _select(bits - 65536, bits, sign_check); +} + +float sext(const in int bits) +{ + return (bits < 0x8000) ? float(bits) : float(bits - 65536); +} + +vec4 fetch_attribute(const in attribute_desc desc, const in int vertex_id, usamplerBuffer input_stream) +{ + const int elem_size_table[] = { 0, 2, 4, 2, 1, 2, 4, 1 }; + const float scaling_table[] = { 1., 32767.5, 1., 1., 255., 1., 32767., 1. 
}; + const int elem_size = elem_size_table[desc.type]; + const vec4 scale = scaling_table[desc.type].xxxx; + + uvec4 tmp, result = uvec4(0u); + vec4 ret; + int n, i = int((vertex_id * desc.stride) + desc.starting_offset); + + for (n = 0; n < desc.attribute_size; n++) + { + tmp.x = texelFetch(input_stream, i++).x; + if (elem_size == 2) + { + tmp.y = texelFetch(input_stream, i++).x; + tmp.x = gen_bits(tmp.x, tmp.y, desc.swap_bytes); + } + else if (elem_size == 4) + { + tmp.y = texelFetch(input_stream, i++).x; + tmp.z = texelFetch(input_stream, i++).x; + tmp.w = texelFetch(input_stream, i++).x; + tmp.x = gen_bits(tmp.x, tmp.y, tmp.z, tmp.w, desc.swap_bytes); + } + + mov(result, n, tmp.x); + } + + // Actual decoding step is done in vector space, outside the loop + if (desc.type == VTX_FMT_SNORM16 || desc.type == VTX_FMT_SINT16) + { + ret = sext(ivec4(result)); + ret = fma(vec4(0.5), vec4(desc.type == VTX_FMT_SNORM16), ret); + } + else if (desc.type == VTX_FMT_FLOAT32) + { + ret = uintBitsToFloat(result); + } + else if (desc.type == VTX_FMT_FLOAT16) + { + tmp.x = _set_bits(result.x, result.y, 16, 16); + tmp.y = _set_bits(result.z, result.w, 16, 16); + ret.xy = unpackHalf2x16(tmp.x); + ret.zw = unpackHalf2x16(tmp.y); + } + else if (elem_size == 1) // (desc.type == VTX_FMT_UINT8 || desc.type == VTX_FMT_UNORM8) + { + // Ignore bswap on single byte channels + ret = vec4(result); + } + else // if (desc.type == VTX_FMT_COMP32) + { + result = uvec4(_get_bits(result.x, 0, 11), + _get_bits(result.x, 11, 11), + _get_bits(result.x, 22, 10), + uint(scale.x)); + ret = sext(ivec4(result) << ivec4(5, 5, 6, 0)); + } + + if (desc.attribute_size < 4) + { + ret.w = scale.x; + } + + return ret / scale; +} + +attribute_desc fetch_desc(const in int location) +{ + // Each descriptor is 64 bits wide + // [0-8] attribute stride + // [8-24] attribute divisor + // [24-27] attribute type + // [27-30] attribute size + // [30-31] reserved + // [32-60] starting offset + // [60-61] swap bytes flag + // [61-62] volatile flag + // [62-63] modulo enable flag; + +#ifdef VULKAN + // Fetch parameters streamed separately from draw parameters + uvec2 attrib = texelFetch(vertex_layout_stream, location + int(layout_ptr_offset)).xy; +#else + // Data is packed into a ubo + int block = (location >> 1); + int sub_block = (location & 1) << 1; + uvec2 attrib = uvec2( + ref(input_attributes_blob[block], sub_block + 0), + ref(input_attributes_blob[block], sub_block + 1)); +#endif + + attribute_desc result; + result.stride = _get_bits(attrib.x, 0, 8); + result.frequency = _get_bits(attrib.x, 8, 16); + result.type = _get_bits(attrib.x, 24, 3); + result.attribute_size = _get_bits(attrib.x, 27, 3); + result.starting_offset = _get_bits(attrib.y, 0, 29); + result.swap_bytes = _test_bit(attrib.y, 29); + result.is_volatile = _test_bit(attrib.y, 30); + result.modulo = _test_bit(attrib.y, 31); + return result; +} + +vec4 read_location(const in int location) +{ + attribute_desc desc = fetch_desc(location); + int vertex_id = _gl_VertexID - int(vertex_base_index); + if (desc.frequency == 0) + { + vertex_id = 0; + } + else if (desc.modulo) + { + // if a vertex modifier is active; vertex_base must be 0 and is ignored + vertex_id = (_gl_VertexID + int(vertex_index_offset)) % int(desc.frequency); + } + else + { + vertex_id /= int(desc.frequency); + } + + if (desc.is_volatile) + { + return fetch_attribute(desc, vertex_id, volatile_input_stream); + } + else + { + return fetch_attribute(desc, vertex_id, persistent_input_stream); + } +} + +)" diff --git 
a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj
index 8d3d6c04d1..17051eb305 100644
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@@ -911,6 +911,7 @@
+    <None Include="Emu\RSX\Program\GLSLSnippets\RSXProg\RSXVertexFetch.glsl" />
diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters
index d219df8c18..0adf76dedf 100644
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@@ -2430,5 +2430,8 @@
       <Filter>Emu\GPU\RSX\Program\Snippets\RSXProg</Filter>
     </None>
+    <None Include="Emu\RSX\Program\GLSLSnippets\RSXProg\RSXVertexFetch.glsl">
+      <Filter>Emu\GPU\RSX\Program\Snippets\RSXProg</Filter>
+    </None>
\ No newline at end of file
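
The migration works because the new snippet file is itself a C++ raw string literal: RSXVertexFetch.glsl opens with R"( and closes with )", so the #include inside GLSLCommon.cpp expands into a single string expression that is streamed straight into OS. Below is a minimal, self-contained sketch of the same raw-string-include pattern; the file names snippet.glsl and main.cpp and the function emit_vertex_fetch are illustrative placeholders, not part of this patch.

// snippet.glsl -- hypothetical stand-in for RSXVertexFetch.glsl; the whole file is
// one C++ raw string literal, with the GLSL text living between R"( and )" verbatim.
R"(
vec4 read_location(const in int location);
)"

// main.cpp -- streams the snippet into an output stream, as GLSLCommon.cpp now does.
#include <iostream>
#include <sstream>

static void emit_vertex_fetch(std::ostream& OS)
{
    OS <<
    #include "snippet.glsl"
    ; // the preprocessor pastes the raw string literal between << and ;
}

int main()
{
    std::stringstream program;
    emit_vertex_fetch(program); // program.str() now holds the GLSL text
    std::cout << program.str();
}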