From 90f411889ba271f747d7d230c461abbc2b970667 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 24 Jul 2024 23:23:24 +0300 Subject: [PATCH 01/63] Renderer: Add prepareForDraw callback --- include/PICA/shader_unit.hpp | 7 +- include/renderer.hpp | 9 ++- include/renderer_gl/renderer_gl.hpp | 4 +- src/core/PICA/gpu.cpp | 2 +- src/core/PICA/regs.cpp | 1 + src/core/PICA/shader_decompiler.cpp | 22 +++--- src/core/renderer_gl/renderer_gl.cpp | 109 +++++++++++++++------------ 7 files changed, 82 insertions(+), 72 deletions(-) diff --git a/include/PICA/shader_unit.hpp b/include/PICA/shader_unit.hpp index d8d931603..80e013468 100644 --- a/include/PICA/shader_unit.hpp +++ b/include/PICA/shader_unit.hpp @@ -2,10 +2,9 @@ #include "PICA/shader.hpp" class ShaderUnit { - -public: - PICAShader vs; // Vertex shader - PICAShader gs; // Geometry shader + public: + PICAShader vs; // Vertex shader + PICAShader gs; // Geometry shader ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {} void reset(); diff --git a/include/renderer.hpp b/include/renderer.hpp index 569a730b7..1d1fb6824 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -21,9 +21,11 @@ enum class RendererType : s8 { }; struct EmulatorConfig; -class GPU; struct SDL_Window; +class GPU; +class ShaderUnit; + class Renderer { protected: GPU& gpu; @@ -77,7 +79,10 @@ class Renderer { virtual std::string getUbershader() { return ""; } virtual void setUbershader(const std::string& shader) {} - virtual void setUbershaderSetting(bool value) {} + // This function is called on every draw call before parsing vertex data. + // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between + // ubershaders and shadergen, and so on. 
+ virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {} // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index f5a964a34..6c18a0c63 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -30,7 +30,6 @@ class RendererGL final : public Renderer { OpenGL::VertexArray vao; OpenGL::VertexBuffer vbo; - bool enableUbershader = true; // Data struct { @@ -110,8 +109,7 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - - virtual void setUbershaderSetting(bool value) override { enableUbershader = value; } + virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index fe336edc8..b6d903e4a 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -117,13 +117,13 @@ void GPU::reset() { externalRegs[Framebuffer1Config] = static_cast(PICA::ColorFmt::RGB8); externalRegs[Framebuffer1Select] = 0; - renderer->setUbershaderSetting(config.useUbershaders); renderer->reset(); } // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { + renderer->prepareForDraw(shaderUnit, false); const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; if (indexed) { diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index f805de60a..c9412fc8f 100644 --- a/src/core/PICA/regs.cpp +++ 
b/src/core/PICA/regs.cpp @@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering depending on the primitive type if (immediateModeVertIndex == 3) { + renderer->prepareForDraw(shaderUnit, true); renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 482aa36ce..ce7d9a330 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -72,19 +72,17 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) { void ShaderDecompiler::writeAttributes() { decompiledShader += R"( - layout(location = 0) in vec4 inputs[8]; - - layout(std140) uniform PICAShaderUniforms { - vec4 uniform_float[96]; - uvec4 uniform_int; - uint uniform_bool; - }; - - vec4 temp_registers[16]; - vec4 dummy_vec = vec4(0.0); + layout(location = 0) in vec4 inputs[8]; + layout(std140) uniform PICAShaderUniforms { + vec4 uniform_float[96]; + uvec4 uniform_int; + uint uniform_bool; + }; + + vec4 temp_registers[16]; + vec4 output_registers[8]; + vec4 dummy_vec = vec4(0.0); )"; - - decompiledShader += "\n"; } std::string ShaderDecompiler::decompile() { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 8b614d2db..90eccf47a 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -4,11 +4,12 @@ #include -#include "config.hpp" #include "PICA/float_types.hpp" -#include "PICA/pica_frag_uniforms.hpp" #include "PICA/gpu.hpp" +#include "PICA/pica_frag_uniforms.hpp" #include "PICA/regs.hpp" +#include "PICA/shader_decompiler.hpp" +#include "config.hpp" #include "math_util.hpp" CMRC_DECLARE(RendererGL); @@ -409,25 +410,6 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::Triangle, }; - bool usingUbershader = enableUbershader; - if 
(usingUbershader) { - const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; - const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; - - // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen - // This way we generate fewer shaders overall than with full shadergen, but don't tank performance - if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { - usingUbershader = false; - } - } - - if (usingUbershader) { - gl.useProgram(triangleProgram); - } else { - OpenGL::Program& program = getSpecializedShader(); - gl.useProgram(program); - } - const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); gl.bindVBO(vbo); @@ -449,38 +431,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const int depthFunc = getBits<4, 3>(depthControl); const int colourMask = getBits<8, 4>(depthControl); gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8); - static constexpr std::array depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL}; - // Update ubershader uniforms - if (usingUbershader) { - const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); - const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32(); - const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1; - - if (oldDepthScale != depthScale) { - oldDepthScale = depthScale; - glUniform1f(ubershaderData.depthScaleLoc, depthScale); - } - - if (oldDepthOffset != depthOffset) { - oldDepthOffset = depthOffset; - glUniform1f(ubershaderData.depthOffsetLoc, depthOffset); - } - - if (oldDepthmapEnable != depthMapEnable) { - oldDepthmapEnable = depthMapEnable; - glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable); - } - - // 
Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48) - // The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates - glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); - setupUbershaderTexEnv(); - } - bindTexturesToSlots(); - if (gpu.fogLUTDirty) { updateFogLUT(); } @@ -951,6 +904,62 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } +void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { + std::string vertShaderSource = PICA::ShaderGen::decompileShader( + shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL + ); + + OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex); + //triangleProgram.create({vert, frag}); + std::cout << vertShaderSource << "\n"; + + bool usingUbershader = emulatorConfig->useUbershaders; + if (usingUbershader) { + const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; + const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1; + + // Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen + // This way we generate fewer shaders overall than with full shadergen, but don't tank performance + if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) { + usingUbershader = false; + } + } + + if (usingUbershader) { + gl.useProgram(triangleProgram); + } else { + OpenGL::Program& program = getSpecializedShader(); + gl.useProgram(program); + } + + // Update ubershader uniforms + if (usingUbershader) { + const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32(); + const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] 
& 0xffffff).toFloat32(); + const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1; + + if (oldDepthScale != depthScale) { + oldDepthScale = depthScale; + glUniform1f(ubershaderData.depthScaleLoc, depthScale); + } + + if (oldDepthOffset != depthOffset) { + oldDepthOffset = depthOffset; + glUniform1f(ubershaderData.depthOffsetLoc, depthOffset); + } + + if (oldDepthmapEnable != depthMapEnable) { + oldDepthmapEnable = depthMapEnable; + glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable); + } + + // Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48) + // The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates + glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); + setupUbershaderTexEnv(); + } +} + void RendererGL::screenshot(const std::string& name) { constexpr uint width = 400; constexpr uint height = 2 * 240; From a2b8a7b23d19c7c1ddd704e91a8b848b1fd1c847 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 24 Jul 2024 23:48:55 +0300 Subject: [PATCH 02/63] Add fmt submodule and port shader decompiler instructions to it --- .gitmodules | 3 +++ CMakeLists.txt | 3 ++- src/core/PICA/shader_decompiler.cpp | 18 ++++++++++-------- third_party/fmt | 1 + 4 files changed, 16 insertions(+), 9 deletions(-) create mode 160000 third_party/fmt diff --git a/.gitmodules b/.gitmodules index 656e1f41d..5b6301b7e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -76,3 +76,6 @@ [submodule "third_party/metal-cpp"] path = third_party/metal-cpp url = https://github.com/Panda3DS-emu/metal-cpp +[submodule "third_party/fmt"] + path = third_party/fmt + url = https://github.com/fmtlib/fmt diff --git a/CMakeLists.txt b/CMakeLists.txt index a3fe41dd2..7c2ec9f15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,6 +93,7 @@ if (NOT ANDROID) 
target_link_libraries(AlberCore PUBLIC SDL2-static) endif() +add_subdirectory(third_party/fmt) add_subdirectory(third_party/toml11) include_directories(${SDL2_INCLUDE_DIR}) include_directories(third_party/toml11) @@ -419,7 +420,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN target_sources(AlberCore PRIVATE ${ALL_SOURCES}) target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra) -target_link_libraries(AlberCore PUBLIC glad capstone) +target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt) if(ENABLE_DISCORD_RPC AND NOT ANDROID) target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1") diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index ce7d9a330..826cfaccf 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -1,5 +1,7 @@ #include "PICA/shader_decompiler.hpp" +#include + #include "config.hpp" using namespace PICA; @@ -254,14 +256,14 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { switch (opcode) { case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break; - case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break; - case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break; - case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break; - case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break; - - case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break; - case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break; - case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break; + case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, 
src2)); break; + case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2)); break; + case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break; + case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break; + + case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break; + case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break; + case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break; default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break; } diff --git a/third_party/fmt b/third_party/fmt new file mode 160000 index 000000000..f8581bcec --- /dev/null +++ b/third_party/fmt @@ -0,0 +1 @@ +Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4 From 251ff5ee495039b5f023cbba9191d4e8323da44c Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Thu, 25 Jul 2024 00:19:07 +0300 Subject: [PATCH 03/63] Add shader acceleration setting --- include/config.hpp | 6 ++++-- src/config.cpp | 2 ++ src/libretro_core.cpp | 6 +++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/config.hpp b/include/config.hpp index 52be1af7e..46d2fec81 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -20,11 +20,13 @@ struct EmulatorConfig { #else static constexpr bool ubershaderDefault = true; #endif - + static constexpr bool accelerateShadersDefault = false; + bool shaderJitEnabled = shaderJitDefault; - bool discordRpcEnabled = false; bool useUbershaders = ubershaderDefault; + bool accelerateShaders = accelerateShadersDefault; bool accurateShaderMul = false; + bool discordRpcEnabled = false; // Toggles whether to force shadergen when there's more than N lights active and we're using the 
ubershader, for better performance bool forceShadergenForLights = true; diff --git a/src/config.cpp b/src/config.cpp index dae5a0ab0..b774d0640 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -64,6 +64,7 @@ void EmulatorConfig::load() { vsyncEnabled = toml::find_or(gpu, "EnableVSync", true); useUbershaders = toml::find_or(gpu, "UseUbershaders", ubershaderDefault); accurateShaderMul = toml::find_or(gpu, "AccurateShaderMultiplication", false); + accelerateShaders = toml::find_or(gpu, "AccelerateShaders", accelerateShadersDefault); forceShadergenForLights = toml::find_or(gpu, "ForceShadergenForLighting", true); lightShadergenThreshold = toml::find_or(gpu, "ShadergenLightThreshold", 1); @@ -135,6 +136,7 @@ void EmulatorConfig::save() { data["GPU"]["UseUbershaders"] = useUbershaders; data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights; data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold; + data["GPU"]["AccelerateShaders"] = accelerateShaders; data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType)); data["Audio"]["EnableAudio"] = audioEnabled; diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp index 3e0436b86..fa9f6d14e 100644 --- a/src/libretro_core.cpp +++ b/src/libretro_core.cpp @@ -148,6 +148,8 @@ static bool FetchVariableBool(std::string key, bool def) { static void configInit() { static const retro_variable values[] = { {"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"}, + {"panda3ds_accelerate_shaders", + EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"}, {"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"}, {"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? 
"Use ubershaders (No stutter, maybe slower); enabled|disabled" : "Use ubershaders (No stutter, maybe slower); disabled|enabled"}, @@ -179,7 +181,9 @@ static void configUpdate() { config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true); config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false); config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false); - config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", true); + config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault); + config.accelerateShaders = FetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault); + config.forceShadergenForLights = FetchVariableBool("panda3ds_ubershader_lighting_override", true); config.lightShadergenThreshold = std::clamp(std::stoi(FetchVariable("panda3ds_ubershader_lighting_override_threshold", "1")), 1, 8); config.discordRpcEnabled = false; From 2f4c169cad4ab489d0141921c983e35b80eb8d2f Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Thu, 25 Jul 2024 04:04:41 +0300 Subject: [PATCH 04/63] Hook up vertex shaders to shader cache --- CMakeLists.txt | 1 + include/PICA/gpu.hpp | 8 ++- include/PICA/pica_vert_config.hpp | 31 ++++++++++ include/PICA/shader.hpp | 10 ++-- include/PICA/shader_gen.hpp | 2 + include/renderer.hpp | 3 +- include/renderer_gl/renderer_gl.hpp | 44 +++++++++++++- src/core/PICA/gpu.cpp | 88 ++++++++++++++++----------- src/core/PICA/shader_gen_glsl.cpp | 59 ++++++++++++++++-- src/core/renderer_gl/renderer_gl.cpp | 89 +++++++++++++++++++--------- 10 files changed, 257 insertions(+), 78 deletions(-) create mode 100644 include/PICA/pica_vert_config.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c2ec9f15..a43b7f634 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,6 +256,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp 
include/termcolor.hpp include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp + include/PICA/pica_vert_config.hpp ) cmrc_add_resource_library( diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index ac2a49e6f..1e1d3c4bd 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -13,6 +13,12 @@ #include "memory.hpp" #include "renderer.hpp" +enum class ShaderExecMode { + Interpreter, // Interpret shaders on the CPU + JIT, // Recompile shaders to CPU machine code + Hardware, // Recompile shaders to host shaders and run them on the GPU +}; + class GPU { static constexpr u32 regNum = 0x300; static constexpr u32 extRegNum = 0x1000; @@ -45,7 +51,7 @@ class GPU { uint immediateModeVertIndex; uint immediateModeAttrIndex; // Index of the immediate mode attribute we're uploading - template + template void drawArrays(); // Silly method of avoiding linking problems. 
TODO: Change to something less silly diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp new file mode 100644 index 000000000..ae774405d --- /dev/null +++ b/include/PICA/pica_vert_config.hpp @@ -0,0 +1,31 @@ +#pragma once +#include +#include +#include +#include + +#include "PICA/pica_hash.hpp" +#include "PICA/regs.hpp" +#include "bitfield.hpp" +#include "helpers.hpp" + +namespace PICA { + // Configuration struct used + struct VertConfig { + PICAHash::HashType shaderHash; + PICAHash::HashType opdescHash; + u32 entrypoint; + bool usingUbershader; + + bool operator==(const VertConfig& config) const { + // Hash function and equality operator required by std::unordered_map + return std::memcmp(this, &config, sizeof(VertConfig)) == 0; + } + }; +} // namespace PICA + +// Override std::hash for our vertex config class +template <> +struct std::hash { + std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); } +}; \ No newline at end of file diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index 68b16de88..c725c180a 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -107,6 +107,11 @@ class PICAShader { alignas(16) std::array inputs; // Attributes passed to the shader alignas(16) std::array outputs; alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()}); // Dummy register used by the JIT + + // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT + // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal + // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first + using Hash = PICAHash::HashType; protected: std::array operandDescriptors; @@ -125,11 +130,6 @@ class PICAShader { 
std::array callInfo; ShaderType type; - // We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT - // We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal - // Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first - using Hash = PICAHash::HashType; - Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT) diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp index 215e5adb0..2d39e0787 100644 --- a/include/PICA/shader_gen.hpp +++ b/include/PICA/shader_gen.hpp @@ -30,6 +30,8 @@ namespace PICA::ShaderGen { FragmentGenerator(API api, Language language) : api(api), language(language) {} std::string generate(const PICA::FragmentConfig& config); std::string getDefaultVertexShader(); + // For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader + std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader); void setTarget(API api, Language language) { this->api = api; diff --git a/include/renderer.hpp b/include/renderer.hpp index 1d1fb6824..721364c1a 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -82,7 +82,8 @@ class Renderer { // This function is called on every draw call before parsing vertex data. // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between // ubershaders and shadergen, and so on. 
- virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {} + // Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU + virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; } // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 6c18a0c63..0597235bb 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -3,11 +3,14 @@ #include #include #include +#include #include #include +#include #include "PICA/float_types.hpp" #include "PICA/pica_frag_config.hpp" +#include "PICA/pica_vert_config.hpp" #include "PICA/pica_hash.hpp" #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" @@ -52,6 +55,11 @@ class RendererGL final : public Renderer { float oldDepthScale = -1.0; float oldDepthOffset = 0.0; bool oldDepthmapEnable = false; + // Set by prepareForDraw, tells us whether the current draw is using a hw-accelerated shader + bool usingAcceleratedShader = false; + + // Cached pointer to the current vertex shader when using HW accelerated shaders + OpenGL::Shader* generatedVertexShader = nullptr; SurfaceCache depthBufferCache; SurfaceCache colourBufferCache; @@ -74,7 +82,38 @@ class RendererGL final : public Renderer { OpenGL::Program program; uint uboBinding; }; - std::unordered_map shaderCache; + + struct ShaderCache { + std::unordered_map> vertexShaderCache; + std::unordered_map fragmentShaderCache; + + // Program cache indexed by GLuints for the vertex and fragment shader to use + // Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint + std::unordered_map programCache; + + void clear() { + for (auto& it : programCache) { + CachedProgram& cachedProgram = it.second; + cachedProgram.program.free(); + glDeleteBuffers(1, 
&cachedProgram.uboBinding); + } + + for (auto& it : vertexShaderCache) { + if (it.second.has_value()) { + it.second->free(); + } + } + + for (auto& it : fragmentShaderCache) { + it.second.free(); + } + + programCache.clear(); + vertexShaderCache.clear(); + fragmentShaderCache.clear(); + } + }; + ShaderCache shaderCache; OpenGL::Framebuffer getColourFBO(); OpenGL::Texture getTexture(Texture& tex); @@ -109,14 +148,13 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; + virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); // Note: The caller is responsible for deleting the currently bound FBO before calling this void setFBO(uint handle) { screenFramebuffer.m_handle = handle; } void resetStateManager() { gl.reset(); } - void clearShaderCache(); void initUbershader(OpenGL::Program& program); #ifdef PANDA3DS_FRONTEND_QT diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index b6d903e4a..077c65aa5 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -123,27 +123,38 @@ void GPU::reset() { // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { - renderer->prepareForDraw(shaderUnit, false); - const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; - - if (indexed) { - if (shaderJITEnabled) - drawArrays(); - else - drawArrays(); + const bool hwShaders = renderer->prepareForDraw(shaderUnit, false); + + if (hwShaders) { + if (indexed) { + drawArrays(); + } else { + 
drawArrays(); + } } else { - if (shaderJITEnabled) - drawArrays(); - else - drawArrays(); + const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; + + if (indexed) { + if (shaderJITEnabled) { + drawArrays(); + } else { + drawArrays(); + } + } else { + if (shaderJITEnabled) { + drawArrays(); + } else { + drawArrays(); + } + } } } static std::array vertices; -template +template void GPU::drawArrays() { - if constexpr (useShaderJIT) { + if constexpr (mode == ShaderExecMode::JIT) { shaderJIT.prepare(shaderUnit.vs); } @@ -322,29 +333,38 @@ void GPU::drawArrays() { } } - // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers - // Based on the SH_ATTRIBUTES_PERMUTATION registers. - // Ie it might attribute #0 to v2, #1 to v7, etc - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); - } + // Running shader on the CPU instead of the GPU + if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) { + // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers + // Based on the SH_ATTRIBUTES_PERMUTATION registers. 
+ // Ie it might map attribute #0 to v2, #1 to v7, etc + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f)); + } - if constexpr (useShaderJIT) { - shaderJIT.run(shaderUnit.vs); - } else { - shaderUnit.vs.run(); - } + if constexpr (mode == ShaderExecMode::JIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } - PICA::Vertex& out = vertices[i]; - // Map shader outputs to fixed function properties - const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; - for (int i = 0; i < totalShaderOutputs; i++) { - const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + PICA::Vertex& out = vertices[i]; + // Map shader outputs to fixed function properties + const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + for (int i = 0; i < totalShaderOutputs; i++) { + const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll - const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = vsOutputRegisters[i][j]; + for (int j = 0; j < 4; j++) { // pls unroll + const u32 mapping = (config >> (j * 8)) & 0x1F; + out.raw[mapping] = vsOutputRegisters[i][j]; + } + } + } else { // Using hw shaders and running the shader on the GPU, just write the inputs to the attribute buffer directly + PICA::Vertex& out = vertices[i]; + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f)); } } } diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index 9802be902..d4deee356 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -72,11 +72,6 @@ std::string FragmentGenerator::getDefaultVertexShader() { out float gl_ClipDistance[2]; #endif - vec4 abgr8888ToVec4(uint abgr) { - 
const float scale = 1.0 / 255.0; - return scale * vec4(float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24)); - } - void main() { gl_Position = a_coords; vec4 colourAbs = abs(a_vertexColour); @@ -677,4 +672,58 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf shader += "vec2 value = texelFetch(u_tex_luts, ivec2(int(clamped_index), 24), 0).rg;"; // fog LUT is past the light LUTs shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);"; shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);"; +} + +std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) { + if (usingUbershader) { + Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader"); + return picaSource; + } else { + // TODO: Uniforms and don't hardcode fixed-function semantic indices... + std::string ret = picaSource; + if (api == API::GLES) { + ret += "\n#define USING_GLES\n"; + } + + ret += R"( +out vec4 v_quaternion; +out vec4 v_colour; +out vec3 v_texcoord0; +out vec2 v_texcoord1; +out vec3 v_view; +out vec2 v_texcoord2; + +#ifndef USING_GLES + out float gl_ClipDistance[2]; +#endif + +void main() { + pica_shader_main(); + vec4 a_coords = output_registers[0]; + vec4 a_vertexColour = output_registers[1]; + vec2 a_texcoord0 = output_registers[2].xy; + float a_texcoord0_w = output_registers[2].w; + vec2 a_texcoord1 = output_registers[3].xy; + vec2 a_texcoord2 = output_registers[4].xy; + vec3 a_view = output_registers[5].xyz; + vec4 a_quaternion = output_registers[6]; + + gl_Position = a_coords; + vec4 colourAbs = abs(a_vertexColour); + v_colour = min(colourAbs, vec4(1.f)); + + v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w); + v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y); + v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y); + v_view = a_view; + v_quaternion = a_quaternion; + +#ifndef 
USING_GLES + //gl_ClipDistance[0] = -a_coords.z; + //gl_ClipDistance[1] = dot(clipCoords, a_coords); +#endif +})"; + + return ret; + } } \ No newline at end of file diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 90eccf47a..c593ad96f 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -25,7 +25,7 @@ void RendererGL::reset() { colourBufferCache.reset(); textureCache.reset(); - clearShaderCache(); + shaderCache.clear(); // Init the colour/depth buffer settings to some random defaults on reset colourBufferLoc = 0; @@ -788,18 +788,24 @@ OpenGL::Program& RendererGL::getSpecializedShader() { PICA::FragmentConfig fsConfig(regs); - CachedProgram& programEntry = shaderCache[fsConfig]; + OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig]; + if (!fragShader.exists()) { + std::string fs = fragShaderGen.generate(fsConfig); + fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment); + } + + // Get the handle of the current vertex shader + OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs; + // And form the key for looking up a shader program + const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle()); + + CachedProgram& programEntry = shaderCache.programCache[programKey]; OpenGL::Program& program = programEntry.program; if (!program.exists()) { - std::string fs = fragShaderGen.generate(fsConfig); - - OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment); - program.create({defaultShadergenVs, fragShader}); + program.create({vertexShader, fragShader}); gl.useProgram(program); - fragShader.free(); - // Init sampler objects. 
Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3 glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0); glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); @@ -904,15 +910,8 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } -void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { - std::string vertShaderSource = PICA::ShaderGen::decompileShader( - shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL - ); - - OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex); - //triangleProgram.create({vert, frag}); - std::cout << vertShaderSource << "\n"; - +bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { + // First we figure out if we will be using an ubershader bool usingUbershader = emulatorConfig->useUbershaders; if (usingUbershader) { const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0; @@ -925,6 +924,46 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { } } + // Then we figure out if we will use hw accelerated shaders, and try to fetch our shader + // TODO: Ubershader support for accelerated shaders + usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader; + + if (usingAcceleratedShader) { + auto shaderCodeHash = shaderUnit.vs.getCodeHash(); + auto opdescHash = shaderUnit.vs.getOpdescHash(); + auto vertexConfig = PICA::VertConfig{ + .shaderHash = shaderCodeHash, + .opdescHash = opdescHash, + .entrypoint = shaderUnit.vs.entrypoint, + .usingUbershader = usingUbershader, + }; + + std::optional& shader = shaderCache.vertexShaderCache[vertexConfig]; + // If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works. 
+ if (!shader.has_value()) { + // Initialize shader to a "null" shader (handle == 0) + *shader = OpenGL::Shader(); + + std::string picaShaderSource = PICA::ShaderGen::decompileShader( + shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL + ); + + // Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload + // it to the GPU + if (!picaShaderSource.empty()) { + std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader); + shader->create({vertexShaderSource}, OpenGL::Vertex); + } + } + + // Shader generation did not work out, so set usingAcceleratedShader to false + if (!shader->exists()) { + usingAcceleratedShader = false; + } else { + generatedVertexShader = &(*shader); + } + } + if (usingUbershader) { gl.useProgram(triangleProgram); } else { @@ -958,6 +997,8 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]); setupUbershaderTexEnv(); } + + return usingAcceleratedShader; } void RendererGL::screenshot(const std::string& name) { @@ -985,22 +1026,12 @@ void RendererGL::screenshot(const std::string& name) { stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0); } -void RendererGL::clearShaderCache() { - for (auto& shader : shaderCache) { - CachedProgram& cachedProgram = shader.second; - cachedProgram.program.free(); - glDeleteBuffers(1, &cachedProgram.uboBinding); - } - - shaderCache.clear(); -} - void RendererGL::deinitGraphicsContext() { // Invalidate all surface caches since they'll no longer be valid textureCache.reset(); depthBufferCache.reset(); colourBufferCache.reset(); - clearShaderCache(); + shaderCache.clear(); // All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext // TODO: Make it so that depth
and colour buffers get written back to 3DS memory @@ -1048,4 +1079,4 @@ void RendererGL::initUbershader(OpenGL::Program& program) { glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2); glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3); -} +} \ No newline at end of file From efcb42af2c15fbe5e837dcdc159384ca87034551 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Thu, 25 Jul 2024 23:36:22 +0300 Subject: [PATCH 05/63] Shader decompiler: Fix redundant compilations --- include/renderer_gl/renderer_gl.hpp | 1 - src/core/renderer_gl/renderer_gl.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 41dba6eb6..2471bae03 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -95,7 +95,6 @@ class RendererGL final : public Renderer { for (auto& it : programCache) { CachedProgram& cachedProgram = it.second; cachedProgram.program.free(); - glDeleteBuffers(1, &cachedProgram.uboBinding); } for (auto& it : vertexShaderCache) { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 6f0cab1a9..5cd7ccedb 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -944,7 +944,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { // If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works. 
if (!shader.has_value()) { // Initialize shader to a "null" shader (handle == 0) - *shader = OpenGL::Shader(); + shader = OpenGL::Shader(); std::string picaShaderSource = PICA::ShaderGen::decompileShader( shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL From d9f4f3736f48fdb20da6c33915daa68eb6adaf23 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 00:21:26 +0300 Subject: [PATCH 06/63] Shader Decompiler: Fix vertex attribute upload --- include/renderer_gl/renderer_gl.hpp | 5 ++- src/core/PICA/gpu.cpp | 3 +- src/core/renderer_gl/renderer_gl.cpp | 47 +++++++++++++++++----------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 2471bae03..cb9328276 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -31,7 +31,10 @@ class RendererGL final : public Renderer { OpenGL::Program triangleProgram; OpenGL::Program displayProgram; - OpenGL::VertexArray vao; + // VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes + OpenGL::VertexArray defaultVAO; + // VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing. 
+ OpenGL::VertexArray hwShaderVAO; OpenGL::VertexBuffer vbo; // Data diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 077c65aa5..a6d734fd0 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -364,7 +364,8 @@ void GPU::drawArrays() { PICA::Vertex& out = vertices[i]; for (int j = 0; j < totalAttribCount; j++) { const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f)); + // Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats + std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f)); } } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 5cd7ccedb..c2c041b3d 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -85,33 +85,42 @@ void RendererGL::initGraphicsContextInternal() { vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW); gl.bindVBO(vbo); - vao.create(); - gl.bindVAO(vao); + // Initialize the VAO used when not using hw shaders + defaultVAO.create(); + gl.bindVAO(defaultVAO); // Position (x, y, z, w) attributes - vao.setAttributeFloat(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions)); - vao.enableAttribute(0); + defaultVAO.setAttributeFloat(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions)); + defaultVAO.enableAttribute(0); // Quaternion attribute - vao.setAttributeFloat(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion)); - vao.enableAttribute(1); + defaultVAO.setAttributeFloat(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion)); + defaultVAO.enableAttribute(1); // Colour attribute - vao.setAttributeFloat(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour)); - vao.enableAttribute(2); + defaultVAO.setAttributeFloat(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour)); + defaultVAO.enableAttribute(2); // UV 0 attribute - vao.setAttributeFloat(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0)); - vao.enableAttribute(3); +
defaultVAO.setAttributeFloat(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0)); + defaultVAO.enableAttribute(3); // UV 1 attribute - vao.setAttributeFloat(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1)); - vao.enableAttribute(4); + defaultVAO.setAttributeFloat(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1)); + defaultVAO.enableAttribute(4); // UV 0 W-component attribute - vao.setAttributeFloat(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w)); - vao.enableAttribute(5); + defaultVAO.setAttributeFloat(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w)); + defaultVAO.enableAttribute(5); // View - vao.setAttributeFloat(6, 3, sizeof(Vertex), offsetof(Vertex, s.view)); - vao.enableAttribute(6); + defaultVAO.setAttributeFloat(6, 3, sizeof(Vertex), offsetof(Vertex, s.view)); + defaultVAO.enableAttribute(6); // UV 2 attribute - vao.setAttributeFloat(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2)); - vao.enableAttribute(7); + defaultVAO.setAttributeFloat(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2)); + defaultVAO.enableAttribute(7); + + // Initialize the VAO used for hw shaders + hwShaderVAO.create(); + gl.bindVAO(hwShaderVAO); + for (int attr = 0; attr < 8; attr++) { + hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4); + hwShaderVAO.enableAttribute(attr); + } dummyVBO.create(); dummyVAO.create(); @@ -418,7 +427,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); gl.bindVBO(vbo); - gl.bindVAO(vao); + gl.bindVAO(usingAcceleratedShader ? 
hwShaderVAO : defaultVAO); gl.enableClipPlane(0); // Clipping plane 0 is always enabled if (regs[PICA::InternalRegs::ClipEnable] & 1) { From 2fc09223aa0f131ddff5448d6796d08381a1478e Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 01:08:00 +0300 Subject: [PATCH 07/63] Shader compiler: Simplify generated code for reading and faster compilation --- src/core/PICA/shader_decompiler.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 826cfaccf..d2414c13a 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -163,6 +163,12 @@ std::string ShaderDecompiler::getDest(u32 dest) const { } std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const { + // If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle + static constexpr uint noSwizzle = 0x1B; + if (swizzle == noSwizzle) { + return ""; + } + static constexpr std::array names = {'x', 'y', 'z', 'w'}; std::string ret(". 
"); @@ -211,8 +217,10 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c decompiledShader += dest + destSwizzle + " = "; if (writtenLaneCount == 1) { decompiledShader += "float(" + value + ");\n"; - } else { - decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n"; + } else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4 + decompiledShader += fmt::format("vec{}({});\n", writtenLaneCount, value); + } else if (writtenLaneCount == 4) { + decompiledShader += fmt::format("{};\n", value); } } From 213183895abe05e4720520dbce6f06ba7cee1403 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 01:15:03 +0300 Subject: [PATCH 08/63] Further simplify shader decompiler output --- src/core/PICA/shader_decompiler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index d2414c13a..5559bcc56 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -182,7 +182,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const { std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const { std::string ret = "."; - if (destinationMask & 0b1000) { ret += "x"; } @@ -214,7 +213,8 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c return; } - decompiledShader += dest + destSwizzle + " = "; + // Don't write destination swizzle if all lanes are getting written to + decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? 
"" : destSwizzle); if (writtenLaneCount == 1) { decompiledShader += "float(" + value + ");\n"; } else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4 From e8b4992036eb254ed48b3775a072bf4da16e22fb Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 01:24:52 +0300 Subject: [PATCH 09/63] Shader decompiler: More smallen-ing --- src/core/PICA/shader_decompiler.cpp | 10 +++++----- src/core/PICA/shader_gen_glsl.cpp | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 5559bcc56..599bd31e1 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -81,8 +81,8 @@ void ShaderDecompiler::writeAttributes() { uint uniform_bool; }; - vec4 temp_registers[16]; - vec4 output_registers[8]; + vec4 tmp_regs[16]; + vec4 out_regs[8]; vec4 dummy_vec = vec4(0.0); )"; } @@ -141,7 +141,7 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) if (source < 0x10) { return "inputs[" + std::to_string(source) + "]"; } else if (source < 0x20) { - return "temp_registers[" + std::to_string(source - 0x10) + "]"; + return "tmp_regs[" + std::to_string(source - 0x10) + "]"; } else { const usize floatIndex = (source - 0x20) & 0x7f; @@ -154,9 +154,9 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) std::string ShaderDecompiler::getDest(u32 dest) const { if (dest < 0x10) { - return "output_registers[" + std::to_string(dest) + "]"; + return "out_regs[" + std::to_string(dest) + "]"; } else if (dest < 0x20) { - return "temp_registers[" + std::to_string(dest - 0x10) + "]"; + return "tmp_regs[" + std::to_string(dest - 0x10) + "]"; } else { return "dummy_vec"; } diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index cb78242dc..edc8a293c 100644 --- 
a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -696,14 +696,14 @@ out vec2 v_texcoord2; void main() { pica_shader_main(); - vec4 a_coords = output_registers[0]; - vec4 a_vertexColour = output_registers[1]; - vec2 a_texcoord0 = output_registers[2].xy; - float a_texcoord0_w = output_registers[2].w; - vec2 a_texcoord1 = output_registers[3].xy; - vec2 a_texcoord2 = output_registers[4].xy; - vec3 a_view = output_registers[5].xyz; - vec4 a_quaternion = output_registers[6]; + vec4 a_coords = out_regs[0]; + vec4 a_vertexColour = out_regs[1]; + vec2 a_texcoord0 = out_regs[2].xy; + float a_texcoord0_w = out_regs[2].w; + vec2 a_texcoord1 = out_regs[3].xy; + vec2 a_texcoord2 = out_regs[4].xy; + vec3 a_view = out_regs[5].xyz; + vec4 a_quaternion = out_regs[6]; gl_Position = a_coords; vec4 colourAbs = abs(a_vertexColour); From 67ff1ccb8b50e0ad51c4f870cc656e0dbdebbf9d Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 14:28:48 +0300 Subject: [PATCH 10/63] Shader decompiler: Get PICA uniforms uploaded to the GPU --- include/PICA/shader.hpp | 4 ++++ include/renderer_gl/renderer_gl.hpp | 2 ++ src/core/renderer_gl/renderer_gl.cpp | 28 ++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index c725c180a..7f127795e 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -301,6 +301,10 @@ class PICAShader { Hash getCodeHash(); Hash getOpdescHash(); + + // Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU. 
+ static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); } + void* getUniformPointer() { return static_cast(&floatUniforms); } }; static_assert( diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index cb9328276..73b52cc5a 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -80,6 +80,8 @@ class RendererGL final : public Renderer { // We can compile this once and then link it with all other generated fragment shaders OpenGL::Shader defaultShadergenVs; GLuint shadergenFragmentUBO; + // UBO for uploading the PICA uniforms when using hw shaders + GLuint hwShaderUniformUBO; // Cached recompiled fragment shader struct CachedProgram { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index c2c041b3d..17e3702f3 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -83,6 +83,11 @@ void RendererGL::initGraphicsContextInternal() { gl.bindUBO(shadergenFragmentUBO); glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW); + // Allocate memory for the accelerated vertex shader uniform UBO + glGenBuffers(1, &hwShaderUniformUBO); + gl.bindUBO(hwShaderUniformUBO); + glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW); + vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW); gl.bindVBO(vbo); // Initialize the VAO used when not using hw shaders @@ -798,7 +803,8 @@ std::optional RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt } OpenGL::Program& RendererGL::getSpecializedShader() { - constexpr uint uboBlockBinding = 2; + constexpr uint vsUBOBlockBinding = 1; + constexpr uint fsUBOBlockBinding = 2; PICA::FragmentConfig fsConfig(regs); @@ -826,12 +832,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() { glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2); 
glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3); - // Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people, + // Set up the binding for our UBOs. Sadly we can't specify it in the shader like normal people, // As it's an OpenGL 4.2 feature that MacOS doesn't support... - uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms"); - glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding); + uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms"); + glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding); + + if (usingAcceleratedShader) { + uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms"); + glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding); + } + } + glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO); + if (usingAcceleratedShader) { + glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO); } - glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO); // Upload uniform data to our shader's UBO PICA::FragmentUniforms uniforms; @@ -958,7 +972,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { std::string picaShaderSource = PICA::ShaderGen::decompileShader( shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL ); - + // Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload // it to the GPU if (!picaShaderSource.empty()) { @@ -972,6 +986,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { usingAcceleratedShader = false; } else { generatedVertexShader = &(*shader); + gl.bindUBO(hwShaderUniformUBO); + glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); } } From 
db64b0a260d09ebd0e3c1bba1c07b21ba40ee52c Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 14:41:28 +0300 Subject: [PATCH 11/63] Shader decompiler: Readd clipping --- src/core/PICA/shader_gen_glsl.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index edc8a293c..2dbccaeb7 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -682,6 +682,8 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic ret += "\n#define USING_GLES\n"; } + ret += uniformDefinition; + ret += R"( out vec4 v_quaternion; out vec4 v_colour; @@ -716,8 +718,8 @@ void main() { v_quaternion = a_quaternion; #ifndef USING_GLES - //gl_ClipDistance[0] = -a_coords.z; - //gl_ClipDistance[1] = dot(clipCoords, a_coords); + gl_ClipDistance[0] = -a_coords.z; + gl_ClipDistance[1] = dot(clipCoords, a_coords); #endif })"; From 67daf03e446371bd9d07ce7e061b0a00605b988a Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 16:27:41 +0300 Subject: [PATCH 12/63] Shader decompiler: Actually `break` on control flow instructions --- src/core/PICA/shader_decompiler.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 599bd31e1..3cdbeb8ed 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -34,14 +34,14 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e const u32 opcode = instruction >> 26; switch (opcode) { - case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)"); - case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)"); - case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control 
flow operation (IFU)"); - case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)"); - case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); - case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); - case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); - case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); + case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)"); break; + case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)"); break; + case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)"); break; + case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)"); break; + case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break; + case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; + case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; + case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break; case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; default: break; From 5eb15de431ecbec71475f55b28c31f1ce62fe046 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:02:03 +0300 Subject: [PATCH 13/63] Shader decompiler: More control flow handling --- include/PICA/shader_decompiler.hpp | 5 +- src/core/PICA/shader_decompiler.cpp | 217 ++++++++++++++++++++++++++-- 2 files changed, 206 insertions(+), 16 deletions(-) diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp index 1253226fc..42bd56429 100644 --- a/include/PICA/shader_decompiler.hpp +++ b/include/PICA/shader_decompiler.hpp @@ -4,6 +4,7 @@ 
#include #include #include +#include #include "PICA/shader.hpp" #include "PICA/shader_gen_types.hpp" @@ -95,7 +96,8 @@ namespace PICA::ShaderGen { Language language; void compileInstruction(u32& pc, bool& finished); - void compileRange(const AddressRange& range); + // Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction) + std::pair compileRange(const AddressRange& range); void callFunction(const Function& function); const Function* findFunction(const AddressRange& range); @@ -105,6 +107,7 @@ namespace PICA::ShaderGen { std::string getDest(u32 dest) const; std::string getSwizzlePattern(u32 swizzle) const; std::string getDestSwizzle(u32 destinationMask) const; + const char* getCondition(u32 cond, u32 refX, u32 refY); void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value); // Returns if the instruction uses the typical register encodings most instructions use diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 3cdbeb8ed..5134845ed 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -2,6 +2,9 @@ #include +#include +#include + #include "config.hpp" using namespace PICA; @@ -20,6 +23,40 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) { } } +// Helpers for merging parallel/series exit methods from Citra +// Merges exit method of two parallel branches. +static ExitMode exitParallel(ExitMode a, ExitMode b) { + if (a == ExitMode::Unknown) { + return b; + } + else if (b == ExitMode::Unknown) { + return a; + } + else if (a == b) { + return a; + } + return ExitMode::Conditional; +} + +// Cascades exit method of two blocks of code. 
+static ExitMode exitSeries(ExitMode a, ExitMode b) { + assert(a != ExitMode::AlwaysEnd); + + if (a == ExitMode::Unknown) { + return ExitMode::Unknown; + } + + if (a == ExitMode::AlwaysReturn) { + return b; + } + + if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) { + return ExitMode::AlwaysEnd; + } + + return ExitMode::Conditional; +} + ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) { // Initialize exit mode to unknown by default, in order to detect things like unending loops auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown); @@ -32,17 +69,63 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) { const u32 instruction = shader.loadedShader[pc]; const u32 opcode = instruction >> 26; + auto setExitMode = [&it](ExitMode mode) { + it->second = mode; + return it->second; + }; switch (opcode) { - case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)"); break; - case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)"); break; - case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)"); break; - case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)"); break; + case ShaderOpcodes::JMPC: + case ShaderOpcodes::JMPU: { + const u32 dest = getBits<10, 12>(instruction); + // Register this jump address to our outLabels set + labels.insert(dest); + + // This opens up 2 parallel paths of execution + auto branchTakenExit = analyzeFunction(shader, dest, end, labels); + auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels); + return setExitMode(exitParallel(branchTakenExit, branchNotTakenExit)); + } + case ShaderOpcodes::IFU: + case ShaderOpcodes::IFC: { + Helpers::panic("IFC/IFU"); + const u32 num = instruction & 0xff; + const u32 dest = 
getBits<10, 12>(instruction); + + const Function* branchTakenFunc = addFunction(shader, pc + 1, dest); + // Check if analysis of the branch taken func failed and return unknown if it did + if (analysisFailed) { + return setExitMode(ExitMode::Unknown); + } + + // Next analyze the not taken func + ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn; + if (num != 0) { + const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num); + // Check if analysis failed and return unknown if it did + if (analysisFailed) { + return setExitMode(ExitMode::Unknown); + } + + branchNotTakenExitMode = branchNotTakenFunc->exitMode; + } + + auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode); + // Both branches of the if/else end, so there's nothing after the call + if (parallel == ExitMode::AlwaysEnd) { + return setExitMode(parallel); + } else { + ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels); + ExitMode conditionalExitMode = exitSeries(parallel, afterConditional); + return setExitMode(conditionalExitMode); + } + break; + } case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break; case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break; - case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; + case ShaderOpcodes::END: return setExitMode(ExitMode::AlwaysEnd); default: break; } @@ -52,7 +135,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e return ExitMode::AlwaysReturn; } -void ShaderDecompiler::compileRange(const AddressRange& range) { +std::pair ShaderDecompiler::compileRange(const AddressRange& range) { u32 pc = range.start; const u32 end = range.end >= range.start ? 
range.end : PICAShader::maxInstructionCount; bool finished = false; @@ -60,6 +143,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) { while (pc < end && !finished) { compileInstruction(pc, finished); } + + return std::make_pair(pc, finished); } const Function* ShaderDecompiler::findFunction(const AddressRange& range) { @@ -84,6 +169,7 @@ void ShaderDecompiler::writeAttributes() { vec4 tmp_regs[16]; vec4 out_regs[8]; vec4 dummy_vec = vec4(0.0); + bvec2 cmp_reg = bvec2(false); )"; } @@ -124,14 +210,45 @@ std::string ShaderDecompiler::decompile() { callFunction(*findFunction(mainFunctionRange)); decompiledShader += "}\n"; - for (auto& func : controlFlow.functions) { - if (func.outLabels.size() > 0) { - Helpers::panic("Function with out labels"); - } + for (const Function& func : controlFlow.functions) { + if (func.outLabels.empty()) { + decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier()); + compileRange(AddressRange(func.start, func.end)); + decompiledShader += "}\n"; + } else { + auto labels = func.outLabels; + labels.insert(func.start); + + // If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the + // current PC + decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier()); + decompiledShader += fmt::format("uint pc = {}u;\n", func.start); + decompiledShader += "while(true){\nswitch(pc){\n"; + + for (u32 label : labels) { + decompiledShader += fmt::format("case {}u: {{", label); + // Fetch the next label whose address > label + auto it = labels.lower_bound(label + 1); + u32 next = (it == labels.end()) ? 
func.end : *it; + + auto [endPC, finished] = compileRange(AddressRange(label, next)); + if (endPC > next && !finished) { + labels.insert(endPC); + decompiledShader += fmt::format("pc = {}u; break;", endPC); + } + + // Fallthrough to next label + decompiledShader += "}\n"; + } - decompiledShader += "void " + func.getIdentifier() + "() {\n"; - compileRange(AddressRange(func.start, func.end)); - decompiledShader += "}\n"; + decompiledShader += "default: return;\n"; + // Exit the switch and loop + decompiledShader += "} }\n"; + + // Exit the function + decompiledShader += "return;\n"; + decompiledShader += "}\n"; + } } return decompiledShader; @@ -272,6 +389,33 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break; case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break; case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break; + case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break; + + case ShaderOpcodes::CMP1: + case ShaderOpcodes::CMP2: { + static constexpr std::array operators = { + // The last 2 operators always return true and are handled specially + "==", "!=", "<", "<=", ">", ">=", "", "", + }; + + const u32 cmpY = getBits<21, 3>(instruction); + const u32 cmpX = getBits<24, 3>(instruction); + + // Compare x first + if (cmpX >= 6) { + decompiledShader += "cmp_reg.x = true;\n"; + } else { + decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2); + } + + // Then compare Y + if (cmpY >= 6) { + decompiledShader += "cmp_reg.y = true;\n"; + } else { + decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2); + } + break; + } default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); 
break; } @@ -315,7 +459,20 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3); } else { switch (opcode) { - case ShaderOpcodes::END: finished = true; return; + case ShaderOpcodes::JMPC: { + const u32 dest = getBits<10, 12>(instruction); + const u32 condOp = getBits<22, 2>(instruction); + const uint refY = getBit<24>(instruction); + const uint refX = getBit<25>(instruction); + const char* condition = getCondition(condOp, refX, refY); + + decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest); + break; + } + case ShaderOpcodes::END: + decompiledShader += "return;\n"; + finished = true; + return; default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break; } } @@ -323,7 +480,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { pc++; } - bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const { const u32 opcode = instruction >> 26; switch (opcode) { @@ -360,3 +516,34 @@ std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& confi return decompiler.decompile(); } + +const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) { + static constexpr std::array conditions = { + // ref(Y, X) = (0, 0) + "!all(cmp_reg)", + "all(not(cmp_reg))", + "!cmp_reg.x", + "!cmp_reg.y", + + // ref(Y, X) = (0, 1) + "cmp_reg.x || !cmp_reg.y", + "cmp_reg.x && !cmp_reg.y", + "cmp_reg.x", + "!cmp_reg.y", + + // ref(Y, X) = (1, 0) + "!cmp_reg.x || cmp_reg.y", + "!cmp_reg.x && cmp_reg.y", + "!cmp_reg.x", + "cmp_reg.y", + + // ref(Y, X) = (1, 1) + "any(cmp_reg)", + "all(cmp_reg)", + "cmp_reg.x", + "cmp_reg.y", + }; + u32 key = (cond & 0b11) | (refX << 2) | (refY << 3); + + return conditions[key]; +} From a20982f78acaaf519388884572d1dc03995070a1 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:30:31 +0300 Subject: [PATCH 14/63] 
Shader decompiler: Fix desitnation mask --- src/core/PICA/shader_decompiler.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 5134845ed..9a7d768a0 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -65,14 +65,15 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e return it->second; } + auto setExitMode = [&it](ExitMode mode) { + it->second = mode; + return it->second; + }; + // Make sure not to go out of bounds on the shader for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) { const u32 instruction = shader.loadedShader[pc]; const u32 opcode = instruction >> 26; - auto setExitMode = [&it](ExitMode mode) { - it->second = mode; - return it->second; - }; switch (opcode) { case ShaderOpcodes::JMPC: @@ -332,10 +333,8 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c // Don't write destination swizzle if all lanes are getting written to decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? 
"" : destSwizzle); - if (writtenLaneCount == 1) { - decompiledShader += "float(" + value + ");\n"; - } else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4 - decompiledShader += fmt::format("vec{}({});\n", writtenLaneCount, value); + if (writtenLaneCount <= 3) { + decompiledShader += fmt::format("({}){};\n", value, destSwizzle); } else if (writtenLaneCount == 4) { decompiledShader += fmt::format("{};\n", value); } From 44705508ffd1f3baeb394da7743dac77120acd2a Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 00:47:50 +0300 Subject: [PATCH 15/63] Shader Decomp: Remove pair member capture in lambda (unsupported on NDK) --- src/core/PICA/shader_decompiler.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 9a7d768a0..1395f8e3d 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -65,11 +65,6 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e return it->second; } - auto setExitMode = [&it](ExitMode mode) { - it->second = mode; - return it->second; - }; - // Make sure not to go out of bounds on the shader for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) { const u32 instruction = shader.loadedShader[pc]; @@ -85,7 +80,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e // This opens up 2 parallel paths of execution auto branchTakenExit = analyzeFunction(shader, dest, end, labels); auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels); - return setExitMode(exitParallel(branchTakenExit, branchNotTakenExit)); + it->second = exitParallel(branchTakenExit, branchNotTakenExit); + return it->second; } case ShaderOpcodes::IFU: case ShaderOpcodes::IFC: { @@ -96,7 +92,8 @@ ExitMode 
ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e const Function* branchTakenFunc = addFunction(shader, pc + 1, dest); // Check if analysis of the branch taken func failed and return unknown if it did if (analysisFailed) { - return setExitMode(ExitMode::Unknown); + it->second = ExitMode::Unknown; + return it->second; } // Next analyze the not taken func @@ -105,7 +102,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num); // Check if analysis failed and return unknown if it did if (analysisFailed) { - return setExitMode(ExitMode::Unknown); + it->second = ExitMode::Unknown; + return it->second; } branchNotTakenExitMode = branchNotTakenFunc->exitMode; @@ -114,11 +112,13 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode); // Both branches of the if/else end, so there's nothing after the call if (parallel == ExitMode::AlwaysEnd) { - return setExitMode(parallel); + it->second = parallel; + return it->second; } else { ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels); ExitMode conditionalExitMode = exitSeries(parallel, afterConditional); - return setExitMode(conditionalExitMode); + it->second = conditionalExitMode; + return it->second; } break; } @@ -126,7 +126,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break; - case ShaderOpcodes::END: return setExitMode(ExitMode::AlwaysEnd); + case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; default: break; } From 
37d7bad5aaf5da0f6080b5691f96ff23a72ce952 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 03:38:23 +0300 Subject: [PATCH 16/63] Disgusting changes to handle the fact that hw shader shaders are 2x as big --- include/PICA/pica_vert_config.hpp | 20 ++++++++++++++++- include/PICA/shader_gen.hpp | 3 ++- src/core/PICA/gpu.cpp | 33 +++++++++++++++++++++++----- src/core/PICA/shader_decompiler.cpp | 4 ++-- src/core/PICA/shader_gen_glsl.cpp | 29 ++++++++++++++++++++---- src/core/renderer_gl/renderer_gl.cpp | 24 ++++++++++---------- 6 files changed, 89 insertions(+), 24 deletions(-) diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp index ae774405d..083e1997f 100644 --- a/include/PICA/pica_vert_config.hpp +++ b/include/PICA/pica_vert_config.hpp @@ -6,21 +6,39 @@ #include "PICA/pica_hash.hpp" #include "PICA/regs.hpp" +#include "PICA/shader.hpp" #include "bitfield.hpp" #include "helpers.hpp" namespace PICA { - // Configuration struct used + // Configuration struct used struct VertConfig { PICAHash::HashType shaderHash; PICAHash::HashType opdescHash; u32 entrypoint; + + // PICA registers for configuring shader output->fragment semantic mapping + std::array outmaps{}; + u16 outputMask; + u8 outputCount; bool usingUbershader; bool operator==(const VertConfig& config) const { // Hash function and equality operator required by std::unordered_map return std::memcmp(this, &config, sizeof(VertConfig)) == 0; } + + VertConfig(PICAShader& shader, const std::array& regs, bool usingUbershader) : usingUbershader(usingUbershader) { + shaderHash = shader.getCodeHash(); + opdescHash = shader.getOpdescHash(); + entrypoint = shader.entrypoint; + + outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask]; + for (int i = 0; i < outputCount; i++) { + outputMask = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + } + } }; } // 
namespace PICA diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp index 2d39e0787..aef16d50b 100644 --- a/include/PICA/shader_gen.hpp +++ b/include/PICA/shader_gen.hpp @@ -3,6 +3,7 @@ #include "PICA/gpu.hpp" #include "PICA/pica_frag_config.hpp" +#include "PICA/pica_vert_config.hpp" #include "PICA/regs.hpp" #include "PICA/shader_gen_types.hpp" #include "helpers.hpp" @@ -31,7 +32,7 @@ namespace PICA::ShaderGen { std::string generate(const PICA::FragmentConfig& config); std::string getDefaultVertexShader(); // For when PICA shader is acceleration is enabled. Turn the PICA shader source into a proper vertex shader - std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader); + std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader); void setTarget(API api, Language language) { this->api = api; diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index a6d734fd0..998bacf92 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -150,7 +150,19 @@ void GPU::drawArrays(bool indexed) { } } -static std::array vertices; +// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer, +// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist +// of 16 vec4 attributes +union PICAVertexBuffer { + // Used with CPU shaders + std::array vertices; + // Used with GPU shaders. 
We can have up to 16 attributes per vertex, each attribute with 4 floats + std::array vsInputs; + + PICAVertexBuffer() {} +}; + +static PICAVertexBuffer vertexBuffer; template void GPU::drawArrays() { @@ -158,6 +170,10 @@ void GPU::drawArrays() { shaderJIT.prepare(shaderUnit.vs); } + // We can have up to 16 attributes, each one consisting of 4 floats + constexpr u32 maxAttrSizeInFloats = 16 * 4; + auto& vertices = vertexBuffer.vertices; + setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); // Base address for vertex attributes @@ -228,7 +244,14 @@ void GPU::drawArrays() { size_t tag = vertexIndex % vertexCacheSize; // Cache hit if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) { - vertices[i] = vertices[cache.bufferPositions[tag]]; + if constexpr (mode != ShaderExecMode::Hardware) { + vertices[i] = vertices[cache.bufferPositions[tag]]; + } else { + std::memcpy( + &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats], + sizeof(float) * maxAttrSizeInFloats + ); + } continue; } @@ -361,11 +384,11 @@ void GPU::drawArrays() { } } } else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly - PICA::Vertex& out = vertices[i]; + float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats]; for (int j = 0; j < totalAttribCount; j++) { const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - // Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats - std::memcpy(&out.raw[mapping * 4], ¤tAttributes[j], sizeof(vec4f)); + // Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats + std::memcpy(&out[mapping * 4], ¤tAttributes[j], sizeof(vec4f)); } } } diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 1395f8e3d..2adc36614 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -160,7 +160,7 @@ 
const Function* ShaderDecompiler::findFunction(const AddressRange& range) { void ShaderDecompiler::writeAttributes() { decompiledShader += R"( - layout(location = 0) in vec4 inputs[8]; + layout(location = 0) in vec4 inputs[16]; layout(std140) uniform PICAShaderUniforms { vec4 uniform_float[96]; uvec4 uniform_int; @@ -168,7 +168,7 @@ void ShaderDecompiler::writeAttributes() { }; vec4 tmp_regs[16]; - vec4 out_regs[8]; + vec4 out_regs[16]; vec4 dummy_vec = vec4(0.0); bvec2 cmp_reg = bvec2(false); )"; diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index 8eaf09e8f..3920bed4f 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -671,7 +671,28 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);"; } -std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) { +std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) { + // First, calculate output register -> Fixed function fragment semantics based on the VAO config + { + uint count = 0; + u16 outputMask = vertConfig.outputMask; + std::array vsOutputRegisters; + + // See which registers are actually enabled and ignore the disabled ones + for (int i = 0; i < 16; i++) { + if (outputMask & 1) { + vsOutputRegisters[count++] = i; + } + + outputMask >>= 1; + } + + // For the others, map the index to a vs output directly (TODO: What does hw actually do?) 
+ for (; count < 16; count++) { + vsOutputRegisters[count] = count; + } + } + if (usingUbershader) { Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader"); return picaSource; @@ -704,8 +725,8 @@ void main() { float a_texcoord0_w = out_regs[2].w; vec2 a_texcoord1 = out_regs[3].xy; vec2 a_texcoord2 = out_regs[4].xy; - vec3 a_view = out_regs[5].xyz; - vec4 a_quaternion = out_regs[6]; + vec3 a_view = out_regs[2].xyz; + vec4 a_quaternion = out_regs[3]; gl_Position = a_coords; vec4 colourAbs = abs(a_vertexColour); @@ -722,7 +743,7 @@ void main() { gl_ClipDistance[1] = dot(clipCoords, a_coords); #endif })"; - + std::cout << ret << "\n"; return ret; } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 17e3702f3..6fd266baa 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -88,7 +88,7 @@ void RendererGL::initGraphicsContextInternal() { gl.bindUBO(hwShaderUniformUBO); glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW); - vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW); + vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW); gl.bindVBO(vbo); // Initialize the VAO used when not using hw shaders defaultVAO.create(); @@ -122,8 +122,8 @@ void RendererGL::initGraphicsContextInternal() { // Initialize the VAO used for hw shaders hwShaderVAO.create(); gl.bindVAO(hwShaderVAO); - for (int attr = 0; attr < 8; attr++) { - hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4); + for (int attr = 0; attr < 16; attr++) { + hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4); hwShaderVAO.enableAttribute(attr); } @@ -495,7 +495,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v setupStencilTest(stencilEnable); - vbo.bufferVertsSub(vertices); + // If we're using hardware shaders, the vertex array works completely 
different + // And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability. + if (!usingAcceleratedShader) { + vbo.bufferVertsSub(vertices); + } else { + glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data()); + } + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } @@ -956,12 +963,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { if (usingAcceleratedShader) { auto shaderCodeHash = shaderUnit.vs.getCodeHash(); auto opdescHash = shaderUnit.vs.getOpdescHash(); - auto vertexConfig = PICA::VertConfig{ - .shaderHash = shaderCodeHash, - .opdescHash = opdescHash, - .entrypoint = shaderUnit.vs.entrypoint, - .usingUbershader = usingUbershader, - }; + PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader); std::optional& shader = shaderCache.vertexShaderCache[vertexConfig]; // If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works. 
@@ -976,7 +978,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { // Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload // it to the GPU if (!picaShaderSource.empty()) { - std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader); + std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader); shader->create({vertexShaderSource}, OpenGL::Vertex); } } From 9ee1c3964a1568bd23984118aa92db6f32b75784 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 15:36:22 +0300 Subject: [PATCH 17/63] Shader decompiler: Implement proper output semantic mapping --- include/PICA/pica_vert_config.hpp | 2 +- src/core/PICA/shader_gen_glsl.cpp | 56 +++++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp index 083e1997f..606a28e61 100644 --- a/include/PICA/pica_vert_config.hpp +++ b/include/PICA/pica_vert_config.hpp @@ -36,7 +36,7 @@ namespace PICA { outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7; outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask]; for (int i = 0; i < outputCount; i++) { - outputMask = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i]; } } }; diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index 3920bed4f..1aa307332 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -1,3 +1,7 @@ +#include + +#include + #include "PICA/pica_frag_config.hpp" #include "PICA/regs.hpp" #include "PICA/shader_gen.hpp" @@ -673,10 +677,15 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf std::string 
FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) { // First, calculate output register -> Fixed function fragment semantics based on the VAO config + // This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each). + // Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second) + std::array, 32> outputMappings{}; + // Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes + std::array vsOutputRegisters; + { uint count = 0; u16 outputMask = vertConfig.outputMask; - std::array vsOutputRegisters; // See which registers are actually enabled and ignore the disabled ones for (int i = 0; i < 16; i++) { @@ -691,8 +700,38 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic for (; count < 16; count++) { vsOutputRegisters[count] = count; } + + for (int i = 0; i < vertConfig.outputCount; i++) { + const u32 config = vertConfig.outmaps[i]; + for (int j = 0; j < 4; j++) { + const u32 mapping = (config >> (j * 8)) & 0x1F; + outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j); + } + } } + auto getSemanticName = [&](u32 semanticIndex) { + auto [reg, lane] = outputMappings[semanticIndex]; + return fmt::format("out_regs[{}][{}]", reg, lane); + }; + + std::string semantics = fmt::format( + R"( + vec4 a_coords = vec4({}, {}, {}, {}); + vec4 a_quaternion = vec4({}, {}, {}, {}); + vec4 a_vertexColour = vec4({}, {}, {}, {}); + vec2 a_texcoord0 = vec2({}, {}); + float a_texcoord0_w = {}; + vec2 a_texcoord1 = vec2({}, {}); + vec2 a_texcoord2 = vec2({}, {}); + vec3 a_view = vec3({}, {}, {}); + )", + getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6), + getSemanticName(7), getSemanticName(8), getSemanticName(9), 
getSemanticName(10), getSemanticName(11), getSemanticName(12), + getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23), + getSemanticName(18), getSemanticName(19), getSemanticName(20) + ); + if (usingUbershader) { Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader"); return picaSource; @@ -719,15 +758,11 @@ out vec2 v_texcoord2; void main() { pica_shader_main(); - vec4 a_coords = out_regs[0]; - vec4 a_vertexColour = out_regs[1]; - vec2 a_texcoord0 = out_regs[2].xy; - float a_texcoord0_w = out_regs[2].w; - vec2 a_texcoord1 = out_regs[3].xy; - vec2 a_texcoord2 = out_regs[4].xy; - vec3 a_view = out_regs[2].xyz; - vec4 a_quaternion = out_regs[3]; - +)"; + // Transfer fixed function fragment registers from vertex shader output to the fragment shader + ret += semantics; + + ret += R"( gl_Position = a_coords; vec4 colourAbs = abs(a_vertexColour); v_colour = min(colourAbs, vec4(1.f)); @@ -743,6 +778,7 @@ void main() { gl_ClipDistance[1] = dot(clipCoords, a_coords); #endif })"; + std::cout << ret << "\n"; return ret; } From 6c738e821dcbac7e98bcc1f62ee956a72fdc3a76 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 16:06:38 +0300 Subject: [PATCH 18/63] Moar instructions --- include/PICA/shader_decompiler.hpp | 12 +++-- src/core/PICA/shader_decompiler.cpp | 84 +++++++++++++++++++++++++++-- src/core/PICA/shader_gen_glsl.cpp | 18 +++---- 3 files changed, 97 insertions(+), 17 deletions(-) diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp index 42bd56429..d992d0df2 100644 --- a/include/PICA/shader_decompiler.hpp +++ b/include/PICA/shader_decompiler.hpp @@ -1,10 +1,12 @@ #pragma once +#include + +#include #include #include #include -#include -#include #include +#include #include "PICA/shader.hpp" #include "PICA/shader_gen_types.hpp" @@ -42,9 +44,9 @@ namespace PICA::ShaderGen { explicit 
Function(u32 start, u32 end) : start(start), end(end) {} bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); } - std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); } - std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; } - std::string getCallStatement() const { return getIdentifier() + "()"; } + std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); } + std::string getForwardDecl() const { return fmt::format("void fn_{}_{}();\n", start, end); } + std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); } }; std::set functions{}; diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 2adc36614..899aff298 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -85,7 +85,6 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } case ShaderOpcodes::IFU: case ShaderOpcodes::IFC: { - Helpers::panic("IFC/IFU"); const u32 num = instruction & 0xff; const u32 dest = getBits<10, 12>(instruction); @@ -122,7 +121,29 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } break; } - case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break; + case ShaderOpcodes::CALL: { + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); + const Function* calledFunction = addFunction(shader, dest, dest + num); + + // Check if analysis of the branch taken func failed and return unknown if it did + if (analysisFailed) { + it->second = ExitMode::Unknown; + return it->second; + } + + if (calledFunction->exitMode == ExitMode::AlwaysEnd) { + it->second = ExitMode::AlwaysEnd; + return it->second; + } + + // Exit mode of the remainder of this function, after we return from the callee + ExitMode 
postCallExitMode = analyzeFunction(shader, pc + 1, end, labels); + ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode); + + it->second = exitMode; + return exitMode; + } case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break; @@ -464,14 +485,71 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const uint refY = getBit<24>(instruction); const uint refX = getBit<25>(instruction); const char* condition = getCondition(condOp, refX, refY); - + decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest); break; } + + case ShaderOpcodes::IFU: + case ShaderOpcodes::IFC: { + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); + const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest)); + + if (opcode == ShaderOpcodes::IFC) { + const u32 condOp = getBits<22, 2>(instruction); + const uint refY = getBit<24>(instruction); + const uint refX = getBit<25>(instruction); + const char* condition = getCondition(condOp, refX, refY); + + decompiledShader += fmt::format("if ({}) {{", condition); + } else { + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 mask = 1u << bit; + + decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask); + } + + callFunction(*conditionalFunc); + decompiledShader += "}\n"; + + pc = dest; + if (num > 0) { + const Function* elseFunc = findFunction(AddressRange(dest, dest + num)); + pc = dest + num; + + decompiledShader += "else { "; + callFunction(*elseFunc); + decompiledShader += "}\n"; + + if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) { + finished = true; + return; + } + } + + return; + } + + case 
ShaderOpcodes::CALL: { + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); + const Function* calledFunc = findFunction(AddressRange(dest, dest + num)); + callFunction(*calledFunc); + + if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) { + finished = true; + return; + } + break; + } + case ShaderOpcodes::END: decompiledShader += "return;\n"; finished = true; return; + + case ShaderOpcodes::NOP: break; default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break; } } diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index 1aa307332..affe9837f 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -717,15 +717,15 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic std::string semantics = fmt::format( R"( - vec4 a_coords = vec4({}, {}, {}, {}); - vec4 a_quaternion = vec4({}, {}, {}, {}); - vec4 a_vertexColour = vec4({}, {}, {}, {}); - vec2 a_texcoord0 = vec2({}, {}); - float a_texcoord0_w = {}; - vec2 a_texcoord1 = vec2({}, {}); - vec2 a_texcoord2 = vec2({}, {}); - vec3 a_view = vec3({}, {}, {}); - )", + vec4 a_coords = vec4({}, {}, {}, {}); + vec4 a_quaternion = vec4({}, {}, {}, {}); + vec4 a_vertexColour = vec4({}, {}, {}, {}); + vec2 a_texcoord0 = vec2({}, {}); + float a_texcoord0_w = {}; + vec2 a_texcoord1 = vec2({}, {}); + vec2 a_texcoord2 = vec2({}, {}); + vec3 a_view = vec3({}, {}, {}); +)", getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6), getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12), getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23), From d125180847ca92c9ed4dcd18f5d880b94fa7fe10 Mon Sep 17 00:00:00 2001 From: wheremyfoodat 
<44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 17:48:16 +0300 Subject: [PATCH 19/63] Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI --- include/PICA/pica_vert_config.hpp | 3 ++- src/core/PICA/shader_decompiler.cpp | 7 +++++++ src/core/renderer_gl/renderer_gl.cpp | 2 -- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp index 606a28e61..ba66426d3 100644 --- a/include/PICA/pica_vert_config.hpp +++ b/include/PICA/pica_vert_config.hpp @@ -36,7 +36,8 @@ namespace PICA { outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7; outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask]; for (int i = 0; i < outputCount; i++) { - outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + // Mask out unused bits + outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F; } } }; diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 899aff298..e028d6d88 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -408,9 +408,16 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break; case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break; + case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break; case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break; case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break; + case ShaderOpcodes::SLT: + case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break; + + case ShaderOpcodes::SGE: + case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, 
fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break; + case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: { static constexpr std::array operators = { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 6fd266baa..78dfb98fd 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -961,8 +961,6 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader; if (usingAcceleratedShader) { - auto shaderCodeHash = shaderUnit.vs.getCodeHash(); - auto opdescHash = shaderUnit.vs.getOpdescHash(); PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader); std::optional& shader = shaderCache.vertexShaderCache[vertexConfig]; From 4040d885c6eb4ec990c3cbc2890f2a4cce6b245e Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 21:25:51 +0300 Subject: [PATCH 20/63] Shader decompiler: Add register indexing --- src/core/PICA/shader_decompiler.cpp | 57 ++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index e028d6d88..da880fcc2 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -191,7 +191,14 @@ void ShaderDecompiler::writeAttributes() { vec4 tmp_regs[16]; vec4 out_regs[16]; vec4 dummy_vec = vec4(0.0); + ivec3 addr_reg = ivec3(0); bvec2 cmp_reg = bvec2(false); + + vec4 float_uniform_indexed(int source, int offset) { + int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0; + uint index = uint(clipped_offs + source) & 127u; + return (index < 96u) ? 
uniform_float[index] : vec4(1.0); + } )"; } @@ -284,10 +291,15 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) } else { const usize floatIndex = (source - 0x20) & 0x7f; - if (floatIndex >= 96) [[unlikely]] { - return "dummy_vec"; + if (index == 0) { + if (floatIndex >= 96) [[unlikely]] { + return "dummy_vec"; + } + return "uniform_float[" + std::to_string(floatIndex) + "]"; + } else { + static constexpr std::array offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"}; + return fmt::format("float_uniform_indexed({}, {})", floatIndex, offsets[index]); } - return "uniform_float[" + std::to_string(floatIndex) + "]"; } } @@ -391,14 +403,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { std::string dest = getDest(destIndex); - if (idx != 0) { - Helpers::panic("GLSL recompiler: Indexed instruction"); - } - - if (invertSources) { - Helpers::panic("GLSL recompiler: Inverted instruction"); - } - switch (opcode) { case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break; case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break; @@ -444,6 +448,20 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { break; } + case ShaderOpcodes::MOVA: { + const bool writeX = getBit<3>(operandDescriptor); // Should we write the x component of the address register? 
+ const bool writeY = getBit<2>(operandDescriptor); + + if (writeX) { + decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1); + } + + if (writeY) { + decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1); + } + break; + } + default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break; } } else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI @@ -478,11 +496,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { src3 += getSwizzlePattern(swizzle3); std::string dest = getDest(destIndex); - - if (idx != 0) { - Helpers::panic("GLSL recompiler: Indexed instruction"); - } - setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3); } else { switch (opcode) { @@ -493,7 +506,16 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const uint refX = getBit<25>(instruction); const char* condition = getCondition(condOp, refX, refY); - decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest); + decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest); + break; + } + + case ShaderOpcodes::JMPU: { + const u32 dest = getBits<10, 12>(instruction); + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 mask = 1u << bit; + + decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{ pc = {}u; break; }}\n", mask, dest); break; } @@ -556,6 +578,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { finished = true; return; + case ShaderOpcodes::NOP: break; default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break; } From 94bd0600820922813d53d32691bc2168fcd36adf Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 28 Jul 2024 21:28:16 +0300 Subject: [PATCH 21/63] Shader decompiler: Optimize mova with both x and y masked --- src/core/PICA/shader_decompiler.cpp | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index da880fcc2..607b5c926 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -452,11 +452,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const bool writeX = getBit<3>(operandDescriptor); // Should we write the x component of the address register? const bool writeY = getBit<2>(operandDescriptor); - if (writeX) { + if (writeX && writeY) { + decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1); + } else if (writeX) { decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1); - } - - if (writeY) { + } else if (writeY) { decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1); } break; From 59f4f236d88a7f3abb8c0b2863beca74eb53471a Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 29 Jul 2024 00:21:30 +0300 Subject: [PATCH 22/63] Shader decompiler: Add DPH/DPHI --- src/core/PICA/shader_decompiler.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 607b5c926..d2a3405de 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -422,6 +422,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::SGE: case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break; + case ShaderOpcodes::DPH: + case ShaderOpcodes::DPHI: + setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2)); break; + case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: { static constexpr std::array operators = { From 72097404180f566ffe685ecfe938b1888483b794 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 29 Jul 2024 
01:03:41 +0300 Subject: [PATCH 23/63] Fix shader caching being broken --- include/PICA/pica_vert_config.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp index ba66426d3..4300e4542 100644 --- a/include/PICA/pica_vert_config.hpp +++ b/include/PICA/pica_vert_config.hpp @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -23,6 +24,10 @@ namespace PICA { u8 outputCount; bool usingUbershader; + // Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup + // As the padding will get hashed and memcmp'd... + u32 pad{}; + bool operator==(const VertConfig& config) const { // Hash function and equality operator required by std::unordered_map return std::memcmp(this, &config, sizeof(VertConfig)) == 0; @@ -43,6 +48,8 @@ namespace PICA { }; } // namespace PICA +static_assert(sizeof(PICA::VertConfig) == 56); + // Override std::hash for our vertex config class template <> struct std::hash { From 0d6bef2d70c06a6c3cf3bf350715b8bcfe1f6088 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 29 Jul 2024 01:27:13 +0300 Subject: [PATCH 24/63] PICA decompiler: Cache VS uniforms --- include/PICA/shader.hpp | 11 +++++++++++ src/core/PICA/regs.cpp | 2 +- src/core/PICA/shader_unit.cpp | 1 + src/core/renderer_gl/renderer_gl.cpp | 6 +++++- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp index 7f127795e..535b6f4cc 100644 --- a/include/PICA/shader.hpp +++ b/include/PICA/shader.hpp @@ -133,6 +133,10 @@ class PICAShader { Hash lastCodeHash = 0; // Last hash computed for the shader code (Used for the JIT caching mechanism) Hash lastOpdescHash = 0; // Last hash computed for the operand descriptors (Also used for the JIT) + public: + bool uniformsDirty = false; + + protected: bool codeHashDirty = false; bool 
opdescHashDirty = false; @@ -283,6 +287,7 @@ class PICAShader { uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16)); uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8); } + uniformsDirty = true; } } @@ -294,6 +299,12 @@ class PICAShader { u[1] = getBits<8, 8>(word); u[2] = getBits<16, 8>(word); u[3] = getBits<24, 8>(word); + uniformsDirty = true; + } + + void uploadBoolUniform(u32 value) { + boolUniform = value; + uniformsDirty = true; } void run(); diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index c9412fc8f..0c5f4adb7 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -301,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { } case VertexBoolUniform: { - shaderUnit.vs.boolUniform = value & 0xffff; + shaderUnit.vs.uploadBoolUniform(value & 0xffff); break; } diff --git a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp index 759849a8a..6b291d31c 100644 --- a/src/core/PICA/shader_unit.cpp +++ b/src/core/PICA/shader_unit.cpp @@ -34,4 +34,5 @@ void PICAShader::reset() { codeHashDirty = true; opdescHashDirty = true; + uniformsDirty = true; } \ No newline at end of file diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 78dfb98fd..6e50f77be 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -987,7 +987,11 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { } else { generatedVertexShader = &(*shader); gl.bindUBO(hwShaderUniformUBO); - glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); + + if (shaderUnit.vs.uniformsDirty) { + shaderUnit.vs.uniformsDirty = false; + glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); + } } } From 1c9df7c02c6caf21c586782ad1b388570faeb0e5 Mon Sep 17 00:00:00 2001 From: wheremyfoodat 
<44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 29 Jul 2024 01:42:56 +0300 Subject: [PATCH 25/63] Simply vertex cache code --- src/core/PICA/gpu.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 998bacf92..6cbdb100b 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -174,7 +174,9 @@ void GPU::drawArrays() { constexpr u32 maxAttrSizeInFloats = 16 * 4; auto& vertices = vertexBuffer.vertices; - setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + if constexpr (mode != ShaderExecMode::Hardware) { + setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + } // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible @@ -247,8 +249,9 @@ void GPU::drawArrays() { if constexpr (mode != ShaderExecMode::Hardware) { vertices[i] = vertices[cache.bufferPositions[tag]]; } else { + const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats; std::memcpy( - &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats], + &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition], sizeof(float) * maxAttrSizeInFloats ); } From 53ee3f305127cf2fa53effc8eac5c61d04caf1d3 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 29 Jul 2024 01:42:56 +0300 Subject: [PATCH 26/63] Simplify vertex cache code --- src/core/PICA/gpu.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 998bacf92..6cbdb100b 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -174,7 +174,9 @@ void GPU::drawArrays() { constexpr u32 maxAttrSizeInFloats = 16 * 4; auto& vertices = vertexBuffer.vertices; - 
setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + if constexpr (mode != ShaderExecMode::Hardware) { + setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); + } // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible @@ -247,8 +249,9 @@ void GPU::drawArrays() { if constexpr (mode != ShaderExecMode::Hardware) { vertices[i] = vertices[cache.bufferPositions[tag]]; } else { + const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats; std::memcpy( - &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats], + &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition], sizeof(float) * maxAttrSizeInFloats ); } From b46f7ad9bcbbc4c27da2276d36667d55bd7071de Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:55:20 +0300 Subject: [PATCH 27/63] Shader decompiler: Add loops --- src/core/PICA/shader_decompiler.cpp | 44 +++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index d2a3405de..b441c8135 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -146,7 +146,24 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; - case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break; + case ShaderOpcodes::LOOP: { + u32 dest = getBits<10, 12>(instruction); + const Function* loopFunction = addFunction(shader, pc + 1, dest + 1); + if (analysisFailed) { + it->second 
= ExitMode::Unknown; + return it->second; + } + + if (loopFunction->exitMode == ExitMode::AlwaysEnd) { + it->second = ExitMode::AlwaysEnd; + return it->second; + } + + ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels); + ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode); + it->second = exitMode; + return it->second; + } case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; default: break; @@ -577,12 +594,35 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { break; } + case ShaderOpcodes::LOOP: { + const u32 dest = getBits<10, 12>(instruction); + const u32 uniformIndex = getBits<22, 2>(instruction); + + // loop counter = uniform.y + decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 16u) & 0xFFu);\n", uniformIndex); + decompiledShader += fmt::format( + "for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 24) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> " + "8u) & 0xFFu)) {{\n", + pc, pc, uniformIndex, pc, uniformIndex + ); + + AddressRange range(pc + 1, dest + 1); + const Function* func = findFunction(range); + callFunction(*func); + decompiledShader += "}\n"; + + if (func->exitMode == ExitMode::AlwaysEnd) { + finished = true; + return; + } + break; + } + case ShaderOpcodes::END: decompiledShader += "return;\n"; finished = true; return; - case ShaderOpcodes::NOP: break; default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break; } From c7371e3bf4f627700688a896a56d2ee0a8e99e5f Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Thu, 8 Aug 2024 00:38:52 +0300 Subject: [PATCH 28/63] Shader decompiler: Implement safe multiplication --- src/core/PICA/shader_decompiler.cpp | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index b441c8135..6e7304e15 100644 --- 
a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -241,7 +241,7 @@ std::string ShaderDecompiler::decompile() { decompiledShader += R"( vec4 safe_mul(vec4 a, vec4 b) { vec4 res = a * b; - return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res)); + return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res)); } )"; } @@ -423,12 +423,32 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { switch (opcode) { case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break; case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break; - case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2)); break; + case ShaderOpcodes::MUL: + if (!config.accurateShaderMul) { + setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2)); + } else { + setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2)); + } + break; case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break; case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break; - case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break; - case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break; + case ShaderOpcodes::DP3: + if (!config.accurateShaderMul) { + setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); + } else { + // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0) + setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2)); + } + break; + case ShaderOpcodes::DP4: + if (!config.accurateShaderMul) { + setDest(operandDescriptor, dest, 
fmt::format("vec4(dot({}, {}))", src1, src2)); + } else { + // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0) + setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2)); + } + break; case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break; case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break; case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break; @@ -441,7 +461,13 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::DPH: case ShaderOpcodes::DPHI: - setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2)); break; + if (!config.accurateShaderMul) { + setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2)); + } else { + // A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0) + setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2)); + } + break; case ShaderOpcodes::CMP1: case ShaderOpcodes::CMP2: { @@ -517,7 +543,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { src3 += getSwizzlePattern(swizzle3); std::string dest = getDest(destIndex); - setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3); + if (!config.accurateShaderMul) { + setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3)); + } else { + setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3)); + } } else { switch (opcode) { case ShaderOpcodes::JMPC: { From 7e04ab78e8d621b3d583129b6b6aaccbe38c8352 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 19 
Aug 2024 22:32:55 +0300 Subject: [PATCH 29/63] Shader decompiler: Implement LG2/EX2 --- src/core/PICA/shader_decompiler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 6e7304e15..10afc3061 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -452,6 +452,8 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break; case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break; case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break; + case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break; + case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break; case ShaderOpcodes::SLT: case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break; From e481ce87a9cc0c16aeb898790054976183eb7994 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:15:44 +0300 Subject: [PATCH 30/63] Shader decompiler: More control flow --- include/PICA/shader_decompiler.hpp | 5 +- src/core/PICA/shader_decompiler.cpp | 95 +++++++++++++++++++++++------ 2 files changed, 80 insertions(+), 20 deletions(-) diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp index d992d0df2..b7bd869c3 100644 --- a/include/PICA/shader_decompiler.hpp +++ b/include/PICA/shader_decompiler.hpp @@ -45,7 +45,10 @@ namespace PICA::ShaderGen { bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); } std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); } - std::string 
getForwardDecl() const { return fmt::format("void fn_{}_{}();\n", start, end); } + // To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end + // instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called + // from within functions deep in the callstack + std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); } std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); } }; diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 10afc3061..cab55fb01 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -138,14 +138,33 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } // Exit mode of the remainder of this function, after we return from the callee - ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels); - ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode); + const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels); + const ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode); it->second = exitMode; return exitMode; } - case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break; - case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break; + + case ShaderOpcodes::CALLC: + case ShaderOpcodes::CALLU: { + const u32 num = instruction & 0xff; + const u32 dest = getBits<10, 12>(instruction); + const Function* calledFunction = addFunction(shader, dest, dest + num); + + // Check if analysis of the branch taken func failed and return unknown if it did + if (analysisFailed) { + it->second = ExitMode::Unknown; + return it->second; + } + + // Exit mode of the remainder of this function, after we 
return from the callee + const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels); + const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode); + + it->second = exitMode; + return exitMode; + } + case ShaderOpcodes::LOOP: { u32 dest = getBits<10, 12>(instruction); const Function* loopFunction = addFunction(shader, pc + 1, dest + 1); @@ -159,13 +178,13 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e return it->second; } - ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels); - ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode); + const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels); + const ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode); it->second = exitMode; return it->second; } - case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; + case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second; default: break; } } @@ -251,15 +270,20 @@ std::string ShaderDecompiler::decompile() { decompiledShader += func.getForwardDecl(); } - decompiledShader += "void pica_shader_main() {\n"; + decompiledShader += "bool pica_shader_main() {\n"; AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount); callFunction(*findFunction(mainFunctionRange)); - decompiledShader += "}\n"; + decompiledShader += "return true;\n}\n"; for (const Function& func : controlFlow.functions) { if (func.outLabels.empty()) { - decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier()); - compileRange(AddressRange(func.start, func.end)); + decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier()); + + auto [pc, finished] = compileRange(AddressRange(func.start, func.end)); + if (!finished) { + decompiledShader += "return false;"; + } + decompiledShader += "}\n"; } else { auto labels = func.outLabels; @@ -267,7 +291,7 @@ std::string 
ShaderDecompiler::decompile() { // If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the // current PC - decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier()); + decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier()); decompiledShader += fmt::format("uint pc = {}u;\n", func.start); decompiledShader += "while(true){\nswitch(pc){\n"; @@ -287,12 +311,12 @@ std::string ShaderDecompiler::decompile() { decompiledShader += "}\n"; } - decompiledShader += "default: return;\n"; + decompiledShader += "default: return false;\n"; // Exit the switch and loop decompiledShader += "} }\n"; // Exit the function - decompiledShader += "return;\n"; + decompiledShader += "return false;\n"; decompiledShader += "}\n"; } } @@ -613,12 +637,35 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { return; } - case ShaderOpcodes::CALL: { + case ShaderOpcodes::CALL: + case ShaderOpcodes::CALLC: + case ShaderOpcodes::CALLU: { const u32 num = instruction & 0xff; const u32 dest = getBits<10, 12>(instruction); const Function* calledFunc = findFunction(AddressRange(dest, dest + num)); + + // Handle conditions for CALLC/CALLU + if (opcode == ShaderOpcodes::CALLC) { + const u32 condOp = getBits<22, 2>(instruction); + const uint refY = getBit<24>(instruction); + const uint refX = getBit<25>(instruction); + const char* condition = getCondition(condOp, refX, refY); + + decompiledShader += fmt::format("if ({}) {{", condition); + } else if (opcode == ShaderOpcodes::CALLU) { + const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check + const u32 mask = 1u << bit; + + decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask); + } + callFunction(*calledFunc); + // Close brackets for CALLC/CALLU + if (opcode != ShaderOpcodes::CALL) { + decompiledShader += "}"; + } + if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == 
ExitMode::AlwaysEnd) { finished = true; return; @@ -651,7 +698,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { } case ShaderOpcodes::END: - decompiledShader += "return;\n"; + decompiledShader += "return true;\n"; finished = true; return; @@ -686,13 +733,23 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const { case ShaderOpcodes::SLT: case ShaderOpcodes::SLTI: case ShaderOpcodes::SGE: - case ShaderOpcodes::SGEI: return true; + case ShaderOpcodes::SGEI: + case ShaderOpcodes::LITP: return true; default: return false; } } -void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; } +void ShaderDecompiler::callFunction(const Function& function) { + switch (function.exitMode) { + // This function always ends, so call it and return true to signal that we're gonna be ending the shader + case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break; + // This function will potentially end. Call it, see if it returns that it ended, and return that we're ending if it did + case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break; + // This function will not end. Just call it like a normal function. 
+ default: decompiledShader += function.getCallStatement() + ";\n"; break; + } +} std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) { ShaderDecompiler decompiler(shader, config, entrypoint, api, language); @@ -726,7 +783,7 @@ const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) { "cmp_reg.x", "cmp_reg.y", }; - u32 key = (cond & 0b11) | (refX << 2) | (refY << 3); + const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3); return conditions[key]; } From 943cf9b8890b8b822b3559cad3c8830acb6dec95 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 19 Aug 2024 23:46:37 +0300 Subject: [PATCH 31/63] Shader decompiler: Fix JMPU condition --- src/core/PICA/shader_decompiler.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index cab55fb01..73bd4eb01 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -591,8 +591,9 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const u32 dest = getBits<10, 12>(instruction); const u32 bit = getBits<22, 4>(instruction); // Bit of the bool uniform to check const u32 mask = 1u << bit; + const u32 test = (instruction & 1) ^ 1; // If the LSB is 0 we jump if bit = 1, otherwise 0 - decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{ pc = {}u; break; }}\n", mask, dest); + decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? 
"!=" : "==", dest); break; } From 652b6008845bef59539192fec713d270ab4cd86d Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Tue, 20 Aug 2024 15:10:55 +0300 Subject: [PATCH 32/63] Shader decompiler: Convert main function to void --- src/core/PICA/shader_decompiler.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 73bd4eb01..133637a7b 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -270,10 +270,11 @@ std::string ShaderDecompiler::decompile() { decompiledShader += func.getForwardDecl(); } - decompiledShader += "bool pica_shader_main() {\n"; + decompiledShader += "void pica_shader_main() {\n"; AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount); - callFunction(*findFunction(mainFunctionRange)); - decompiledShader += "return true;\n}\n"; + auto mainFunc = findFunction(mainFunctionRange); + + decompiledShader += mainFunc->getCallStatement() + ";\n}\n"; for (const Function& func : controlFlow.functions) { if (func.outLabels.empty()) { From e13ef42b654a8dd0e8122e6f78fb7713ca84e8c2 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:47:57 +0300 Subject: [PATCH 33/63] PICA: Start implementing GPU vertex fetch --- CMakeLists.txt | 4 +- include/PICA/draw_acceleration.hpp | 19 ++++++++ include/PICA/gpu.hpp | 2 + include/renderer.hpp | 5 +- include/renderer_gl/renderer_gl.hpp | 4 +- src/core/PICA/draw_acceleration.cpp | 71 ++++++++++++++++++++++++++++ src/core/PICA/gpu.cpp | 10 +++- src/core/PICA/regs.cpp | 2 +- src/core/renderer_gl/renderer_gl.cpp | 8 +++- 9 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 include/PICA/draw_acceleration.hpp create mode 100644 src/core/PICA/draw_acceleration.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 043bb084b..643e48e38 
100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -241,7 +241,7 @@ set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp - src/core/PICA/shader_decompiler.cpp + src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp ) set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp) @@ -293,7 +293,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp - include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp + include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp ) cmrc_add_resource_library( diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp new file mode 100644 index 000000000..eec76b873 --- /dev/null +++ b/include/PICA/draw_acceleration.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include "helpers.hpp" + +namespace PICA { + struct DrawAcceleration { + u8* vertexBuffer; + u8* indexBuffer; + + // Minimum and maximum index in the index buffer for a draw call + u16 minimumIndex, maximumIndex; + u32 vertexDataSize; + + bool canBeAccelerated; + bool indexed; + }; +} // namespace PICA \ No newline at end of file diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp index 1e1d3c4bd..c168a9bfe 100644 --- a/include/PICA/gpu.hpp +++ b/include/PICA/gpu.hpp @@ -1,6 +1,7 @@ #pragma once 
#include +#include "PICA/draw_acceleration.hpp" #include "PICA/dynapica/shader_rec.hpp" #include "PICA/float_types.hpp" #include "PICA/pica_vertex.hpp" @@ -87,6 +88,7 @@ class GPU { std::unique_ptr renderer; PICA::Vertex getImmediateModeVertex(); + void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed); public: // 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT // Encoded in PICA native format diff --git a/include/renderer.hpp b/include/renderer.hpp index 721364c1a..94a0b0f3c 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -1,9 +1,10 @@ #pragma once #include +#include #include #include -#include +#include "PICA/draw_acceleration.hpp" #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" #include "helpers.hpp" @@ -83,7 +84,7 @@ class Renderer { // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between // ubershaders and shadergen, and so on. 
// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU - virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; } + virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { return false; } // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 73b52cc5a..397aaf534 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -135,6 +135,8 @@ class RendererGL final : public Renderer { void updateFogLUT(); void initGraphicsContextInternal(); + void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel); + public: RendererGL(GPU& gpu, const std::array& internalRegs, const std::array& externalRegs) : Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {} @@ -152,7 +154,7 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override; + virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp new file mode 100644 index 000000000..4f3e5bdd7 --- /dev/null +++ b/src/core/PICA/draw_acceleration.cpp @@ -0,0 +1,71 @@ +#include "PICA/draw_acceleration.hpp" + +#include + +#include "PICA/gpu.hpp" +#include "PICA/regs.hpp" + +void 
GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { + accel.indexed = indexed; + const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; + const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer + + accel.vertexBuffer = getPointerPhys(vertexBase); + if (indexed) { + u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig]; + u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff); + + u8* indexBuffer = getPointerPhys(indexBufferPointer); + u16 minimumIndex = std::numeric_limits::max(); + u16 maximumIndex = 0; + + // Check whether the index buffer uses u16 indices or u8 + bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit + + // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them + if (shortIndex) { + u16* indexBuffer16 = reinterpret_cast(indexBuffer); + for (int i = 0; i < vertexCount; i++) { + u16 index = indexBuffer16[i]; + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + } else { + for (int i = 0; i < vertexCount; i++) { + u16 index = u16(indexBuffer[i]); + minimumIndex = std::min(minimumIndex, index); + maximumIndex = std::max(maximumIndex, index); + } + } + + accel.indexBuffer = indexBuffer; + accel.minimumIndex = minimumIndex; + accel.maximumIndex = maximumIndex; + } else { + accel.indexBuffer = nullptr; + accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg]; + accel.maximumIndex = accel.minimumIndex + vertexCount - 1; + } + + int buffer = 0; + accel.vertexDataSize = 0; + + for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) { + bool fixedAttribute = (fixedAttribMask & (1 << attrCount)) != 0; + + if (!fixedAttribute) { + auto& attr = attributeInfo[buffer]; // Get information for this attribute + + if (attr.componentCount != 0) { + // Size of the attribute 
in bytes multiplied by the total number of vertices + const u32 bytes = attr.size * vertexCount; + // Add it to the total vertex data size, aligned to 4 bytes. + accel.vertexDataSize += (bytes + 3) & ~3; + } + + buffer++; + } + } + + accel.canBeAccelerated = true; +} \ No newline at end of file diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 6cbdb100b..7e9be0053 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -123,7 +123,15 @@ void GPU::reset() { // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { - const bool hwShaders = renderer->prepareForDraw(shaderUnit, false); + PICA::DrawAcceleration accel; + + if (config.accelerateShaders) { + // If we are potentially going to use hw shaders, gather necessary to do vertex fetch, index buffering, etc on the GPU + // This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on + getAcceleratedDrawInfo(accel, indexed); + } + + const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel, false); if (hwShaders) { if (indexed) { diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index 0c5f4adb7..091bd377d 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -249,7 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering depending on the primitive type if (immediateModeVertIndex == 3) { - renderer->prepareForDraw(shaderUnit, true); + renderer->prepareForDraw(shaderUnit, nullptr, true); renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 6e50f77be..d0ecf4433 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ 
b/src/core/renderer_gl/renderer_gl.cpp @@ -942,7 +942,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } -bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { +bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { // First we figure out if we will be using an ubershader bool usingUbershader = emulatorConfig->useUbershaders; if (usingUbershader) { @@ -993,6 +993,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); } } + + accelerateVertexUpload(shaderUnit, accel); } if (usingUbershader) { @@ -1110,4 +1112,8 @@ void RendererGL::initUbershader(OpenGL::Program& program) { glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1); glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2); glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3); +} + +void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { + } \ No newline at end of file From 74a341ba46667696d87160fa979831a53d4f5a73 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sat, 24 Aug 2024 02:58:05 +0300 Subject: [PATCH 34/63] More hw VAO work --- include/PICA/draw_acceleration.hpp | 15 +++++++++ src/core/PICA/draw_acceleration.cpp | 47 ++++++++++++++++++++++++---- src/core/renderer_gl/renderer_gl.cpp | 10 ++++++ 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index eec76b873..f940fc7c0 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -6,13 +6,28 @@ namespace PICA { struct DrawAcceleration { + static constexpr u32 maxAttribCount = 12; + + struct AttributeInfo { + u32 offset; + + u8 type; + u8 componentCount; + bool fixed; + + std::array 
fixedValue; // For fixed attributes + }; + u8* vertexBuffer; u8* indexBuffer; // Minimum and maximum index in the index buffer for a draw call u16 minimumIndex, maximumIndex; + u32 totalAttribCount; u32 vertexDataSize; + std::array attributeInfo; + bool canBeAccelerated; bool indexed; }; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 4f3e5bdd7..827f107db 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -7,6 +7,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.indexed = indexed; + accel.totalAttribCount = totalAttribCount; + const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer @@ -47,23 +49,56 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.maximumIndex = accel.minimumIndex + vertexCount - 1; } + const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); int buffer = 0; accel.vertexDataSize = 0; for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) { - bool fixedAttribute = (fixedAttribMask & (1 << attrCount)) != 0; + auto& attr = accel.attributeInfo[attrCount]; + attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0; - if (!fixedAttribute) { - auto& attr = attributeInfo[buffer]; // Get information for this attribute - - if (attr.componentCount != 0) { + // Variable attribute attribute + if (!attr.fixed) { + auto& attrData = attributeInfo[buffer]; // Get information for this attribute + u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) + u32 attributeOffset = attrData.offset; + + if (attrData.componentCount != 0) { // Size of the attribute in bytes multiplied by the total number of vertices - const u32 bytes = attr.size * vertexCount; + const u32 bytes = 
attrData.size * vertexCount; // Add it to the total vertex data size, aligned to 4 bytes. accel.vertexDataSize += (bytes + 3) & ~3; } + for (int i = 0; i < attrData.componentCount; i++) { + uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg + + // Vertex attributes used as padding + // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively + if (index >= 12) [[unlikely]] { + Helpers::panic("Padding attribute"); + // Align attribute address up to a 4 byte boundary + attributeOffset = (attributeOffset + 3) & -4; + attributeOffset += (index - 11) << 2; + continue; + } + + u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; + u32 attribType = attribInfo & 0x3; // Type of attribute(sbyte/ubyte/short/float) + u32 size = (attribInfo >> 2) + 1; // Total number of components + + attr.componentCount = size; + attr.offset = attributeOffset; + attr.type = attribType; + } + buffer++; + } else { + vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; + + for (int i = 0; i < 4; i++) { + attr.fixedValue[i] = fixedAttr[i].toFloat32(); + } } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index d0ecf4433..71346f9b5 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -1115,5 +1115,15 @@ void RendererGL::initUbershader(OpenGL::Program& program) { } void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { + u32 buffer = 0; // Vertex buffer index for non-fixed attributes + u32 attrCount = 0; + const u32 totalAttribCount = accel->totalAttribCount; + + static constexpr GLenum attributeFormats[4] = { + GL_BYTE, // 0: Signed byte + GL_UNSIGNED_BYTE, // 1: Unsigned byte + GL_SHORT, // 2: Short + GL_FLOAT, // 3: Float + }; } \ No newline at end of file From 5d6f59112aa677084851734e1959d99d1c8d5283 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sat, 24 
Aug 2024 02:58:05 +0300 Subject: [PATCH 35/63] More hw VAO work --- include/PICA/draw_acceleration.hpp | 15 ++++++++ src/core/PICA/draw_acceleration.cpp | 51 ++++++++++++++++++++++++---- src/core/renderer_gl/renderer_gl.cpp | 10 ++++++ 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index eec76b873..f940fc7c0 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -6,13 +6,28 @@ namespace PICA { struct DrawAcceleration { + static constexpr u32 maxAttribCount = 12; + + struct AttributeInfo { + u32 offset; + + u8 type; + u8 componentCount; + bool fixed; + + std::array fixedValue; // For fixed attributes + }; + u8* vertexBuffer; u8* indexBuffer; // Minimum and maximum index in the index buffer for a draw call u16 minimumIndex, maximumIndex; + u32 totalAttribCount; u32 vertexDataSize; + std::array attributeInfo; + bool canBeAccelerated; bool indexed; }; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 4f3e5bdd7..b96f6db42 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -7,6 +7,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.indexed = indexed; + accel.totalAttribCount = totalAttribCount; + const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer @@ -47,23 +49,60 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.maximumIndex = accel.minimumIndex + vertexCount - 1; } + const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); int buffer = 0; accel.vertexDataSize = 0; for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) { - bool fixedAttribute = (fixedAttribMask & (1 
<< attrCount)) != 0; + auto& attr = accel.attributeInfo[attrCount]; + attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0; - if (!fixedAttribute) { - auto& attr = attributeInfo[buffer]; // Get information for this attribute - - if (attr.componentCount != 0) { + // Variable attribute attribute + if (!attr.fixed) { + auto& attrData = attributeInfo[buffer]; // Get information for this attribute + u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) + u32 attributeOffset = attrData.offset; + + if (attrData.componentCount != 0) { // Size of the attribute in bytes multiplied by the total number of vertices - const u32 bytes = attr.size * vertexCount; + const u32 bytes = attrData.size * vertexCount; // Add it to the total vertex data size, aligned to 4 bytes. accel.vertexDataSize += (bytes + 3) & ~3; } + for (int i = 0; i < attrData.componentCount; i++) { + uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg + + // Vertex attributes used as padding + // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively + if (index >= 12) [[unlikely]] { + Helpers::panic("Padding attribute"); + // Align attribute address up to a 4 byte boundary + attributeOffset = (attributeOffset + 3) & -4; + attributeOffset += (index - 11) << 2; + continue; + } + + u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; + u32 attribType = attribInfo & 0x3; // Type of attribute(sbyte/ubyte/short/float) + u32 size = (attribInfo >> 2) + 1; // Total number of components + + attr.componentCount = size; + attr.offset = attributeOffset; + attr.type = attribType; + + // Size of each component based on the attribute type + static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + attributeOffset += size * sizePerComponent[attribType]; + } + buffer++; + } else { + vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; + + for (int i = 0; i < 4; i++) { + attr.fixedValue[i] = fixedAttr[i].toFloat32(); + } } } diff --git 
a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index d0ecf4433..71346f9b5 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -1115,5 +1115,15 @@ void RendererGL::initUbershader(OpenGL::Program& program) { } void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { + u32 buffer = 0; // Vertex buffer index for non-fixed attributes + u32 attrCount = 0; + const u32 totalAttribCount = accel->totalAttribCount; + + static constexpr GLenum attributeFormats[4] = { + GL_BYTE, // 0: Signed byte + GL_UNSIGNED_BYTE, // 1: Unsigned byte + GL_SHORT, // 2: Short + GL_FLOAT, // 3: Float + }; } \ No newline at end of file From a8b30ee2dc5b53f6bd7f62953189d767c01f7186 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 00:45:23 +0300 Subject: [PATCH 36/63] More GPU vertex fetch code --- include/PICA/draw_acceleration.hpp | 1 + src/core/PICA/draw_acceleration.cpp | 33 ++++++++++++++++++++-------- src/core/renderer_gl/renderer_gl.cpp | 8 +++++++ 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index f940fc7c0..bd3e428dd 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -9,6 +9,7 @@ namespace PICA { static constexpr u32 maxAttribCount = 12; struct AttributeInfo { + u8* data; u32 offset; u8 type; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index b96f6db42..e9546cf7b 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -50,15 +50,15 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { } const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); - int buffer = 0; + u32 buffer = 0; + u32 attrCount = 0; 
accel.vertexDataSize = 0; - for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) { - auto& attr = accel.attributeInfo[attrCount]; - attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0; + while (attrCount < totalAttribCount) { + bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0; // Variable attribute attribute - if (!attr.fixed) { + if (!fixedAttrib) { auto& attrData = attributeInfo[buffer]; // Get information for this attribute u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) u32 attributeOffset = attrData.offset; @@ -72,6 +72,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { for (int i = 0; i < attrData.componentCount; i++) { uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg + auto& attr = accel.attributeInfo[attrCount]; + attr.fixed = false; // Vertex attributes used as padding // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively @@ -83,26 +85,39 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { continue; } - u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; - u32 attribType = attribInfo & 0x3; // Type of attribute(sbyte/ubyte/short/float) - u32 size = (attribInfo >> 2) + 1; // Total number of components + const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; + const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) + const u32 size = (attribInfo >> 2) + 1; // Total number of components attr.componentCount = size; attr.offset = attributeOffset; attr.type = attribType; + // Get a pointer to the data where this attribute is stored + const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size); + attr.data = getPointerPhys(attrAddress); + // Size of each component based on the attribute type static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; attributeOffset += size * sizePerComponent[attribType]; + + attrCount += 1; } - 
buffer++; + buffer += 1; } else { vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; + auto& attr = accel.attributeInfo[attrCount]; + + attr.fixed = true; + // Set the data pointer to nullptr in order to catch any potential bugs + attr.data = nullptr; for (int i = 0; i < 4; i++) { attr.fixedValue[i] = fixedAttr[i].toFloat32(); } + + attrCount += 1; } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 71346f9b5..f5728346b 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -1126,4 +1126,12 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele GL_SHORT, // 2: Short GL_FLOAT, // 3: Float }; + + for (int i = 0; i < totalAttribCount; i++) { + const auto& attrib = accel->attributeInfo[i]; + printf( + "%s attribute starting from offset %d with a size of %d components\n", attrib.fixed ? "Fixed" : "Variable", (!attrib.fixed) ? attrib.offset : 0, + !attrib.fixed ? attrib.componentCount : 4 + ); + } } \ No newline at end of file From e34bdb68413a8a0560a3708813949ddd4ba175c8 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 01:47:02 +0300 Subject: [PATCH 37/63] Add GL Stream Buffer from Duckstation --- CMakeLists.txt | 5 +- include/align.hpp | 99 +++++++ include/renderer_gl/renderer_gl.hpp | 8 +- src/core/PICA/draw_acceleration.cpp | 2 + src/core/renderer_gl/renderer_gl.cpp | 8 + third_party/duckstation/gl/stream_buffer.cpp | 288 +++++++++++++++++++ third_party/duckstation/gl/stream_buffer.h | 53 ++++ 7 files changed, 461 insertions(+), 2 deletions(-) create mode 100644 include/align.hpp create mode 100644 third_party/duckstation/gl/stream_buffer.cpp create mode 100644 third_party/duckstation/gl/stream_buffer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 89322af45..6a94047c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ 
include_directories(${SDL2_INCLUDE_DIR}) include_directories(third_party/toml11) include_directories(third_party/glm) include_directories(third_party/renderdoc) +include_directories(third_party/duckstation) add_subdirectory(third_party/cmrc) @@ -302,6 +303,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp + include/align.hpp ) cmrc_add_resource_library( @@ -334,7 +336,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID) endif() if(ENABLE_QT_GUI) - include_directories(third_party/duckstation) set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp) if(APPLE) @@ -377,6 +378,8 @@ if(ENABLE_OPENGL) src/host_shaders/opengl_fragment_shader.frag ) + set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp) + set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES}) source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES}) diff --git a/include/align.hpp b/include/align.hpp new file mode 100644 index 000000000..6b79a6564 --- /dev/null +++ b/include/align.hpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include + +#include "helpers.hpp" + +#ifdef _MSC_VER +#include +#endif + +namespace Common { + template + constexpr bool isAligned(T value, unsigned int alignment) { + return (value % static_cast(alignment)) == 0; + } + + template + constexpr T alignUp(T value, unsigned int alignment) { + return (value + static_cast(alignment - 1)) / static_cast(alignment) * 
static_cast(alignment); + } + + template + constexpr T alignDown(T value, unsigned int alignment) { + return value / static_cast(alignment) * static_cast(alignment); + } + + template + constexpr bool isAlignedPow2(T value, unsigned int alignment) { + return (value & static_cast(alignment - 1)) == 0; + } + + template + constexpr T alignUpPow2(T value, unsigned int alignment) { + return (value + static_cast(alignment - 1)) & static_cast(~static_cast(alignment - 1)); + } + + template + constexpr T alignDownPow2(T value, unsigned int alignment) { + return value & static_cast(~static_cast(alignment - 1)); + } + + template + constexpr bool isPow2(T value) { + return (value & (value - 1)) == 0; + } + + template + constexpr T previousPow2(T value) { + if (value == static_cast(0)) return 0; + + value |= (value >> 1); + value |= (value >> 2); + value |= (value >> 4); + if constexpr (sizeof(T) >= 16) value |= (value >> 8); + if constexpr (sizeof(T) >= 32) value |= (value >> 16); + if constexpr (sizeof(T) >= 64) value |= (value >> 32); + return value - (value >> 1); + } + + template + constexpr T nextPow2(T value) { + // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + if (value == static_cast(0)) return 0; + + value--; + value |= (value >> 1); + value |= (value >> 2); + value |= (value >> 4); + if constexpr (sizeof(T) >= 16) value |= (value >> 8); + if constexpr (sizeof(T) >= 32) value |= (value >> 16); + if constexpr (sizeof(T) >= 64) value |= (value >> 32); + value++; + return value; + } + + ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) { +#ifdef _MSC_VER + return _aligned_malloc(size, alignment); +#else + // Unaligned sizes are slow on macOS. +#ifdef __APPLE__ + if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1); +#endif + void* ret = nullptr; + return (posix_memalign(&ret, alignment, size) == 0) ? 
ret : nullptr; +#endif + } + + ALWAYS_INLINE static void alignedFree(void* ptr) { +#ifdef _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif + } +} // namespace Common \ No newline at end of file diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 397aaf534..63bbb474c 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,11 +11,12 @@ #include "PICA/float_types.hpp" #include "PICA/pica_frag_config.hpp" -#include "PICA/pica_vert_config.hpp" #include "PICA/pica_hash.hpp" +#include "PICA/pica_vert_config.hpp" #include "PICA/pica_vertex.hpp" #include "PICA/regs.hpp" #include "PICA/shader_gen.hpp" +#include "gl/stream_buffer.h" #include "gl_state.hpp" #include "helpers.hpp" #include "logger.hpp" @@ -83,6 +85,10 @@ class RendererGL final : public Renderer { // UBO for uploading the PICA uniforms when using hw shaders GLuint hwShaderUniformUBO; + using StreamBuffer = OpenGLStreamBuffer; + std::unique_ptr hwVertexBuffer; + std::unique_ptr hwIndexBuffer; + // Cached recompiled fragment shader struct CachedProgram { OpenGL::Program program; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index e9546cf7b..5fc21e48a 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -82,6 +82,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Align attribute address up to a 4 byte boundary attributeOffset = (attributeOffset + 3) & -4; attributeOffset += (index - 11) << 2; + + attr.data = nullptr; continue; } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index f5728346b..3b2d1d70b 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -78,6 +78,14 @@ void RendererGL::initGraphicsContextInternal() { 
gl.useProgram(displayProgram); glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0); // Init sampler object + // Create stream buffers for vertex, index and uniform buffers + // TODO: Remove buffers from GL state tracking as the StreamBuffer implementation bypasses the state tracker. + static constexpr usize hwIndexBufferSize = 2_MB; + static constexpr usize hwVertexBufferSize = 16_MB; + + hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize); + hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize); + // Allocate memory for the shadergen fragment uniform UBO glGenBuffers(1, &shadergenFragmentUBO); gl.bindUBO(shadergenFragmentUBO); diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp new file mode 100644 index 000000000..f4f8b54cf --- /dev/null +++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -0,0 +1,288 @@ +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#include "gl/stream_buffer.h" + +#include +#include + +#include "align.hpp" + +OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {} +OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); } + +void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); } +void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); } + +void OpenGLStreamBuffer::SetDebugName(std::string_view name) { +#ifdef GPU_DEBUG_INFO + if (glObjectLabel) { + glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast(name.length()), static_cast(name.data())); + } +#endif +} + +namespace { + // Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage. 
+ class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer { + public: + ~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); } + + MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast(m_cpu_buffer), 0, 0, m_size / alignment}; } + + u32 Unmap(u32 used_size) override { + if (used_size == 0) return 0; + + glBindBuffer(m_target, m_buffer_id); + glBufferSubData(m_target, 0, used_size, m_cpu_buffer); + return 0; + } + + u32 GetChunkSize() const override { return m_size; } + + static std::unique_ptr Create(GLenum target, u32 size) { + glGetError(); + + GLuint buffer_id; + glGenBuffers(1, &buffer_id); + glBindBuffer(target, buffer_id); + glBufferData(target, size, nullptr, GL_STREAM_DRAW); + + GLenum err = glGetError(); + if (err != GL_NO_ERROR) { + glBindBuffer(target, 0); + glDeleteBuffers(1, &buffer_id); + return {}; + } + + return std::unique_ptr(new BufferSubDataStreamBuffer(target, buffer_id, size)); + } + + private: + BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) { + m_cpu_buffer = static_cast(Common::alignedMalloc(size, 32)); + if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer"); + } + + u8* m_cpu_buffer; + }; + + // Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync. 
+ class BufferDataStreamBuffer final : public OpenGLStreamBuffer { + public: + ~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); } + + MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast(m_cpu_buffer), 0, 0, m_size / alignment}; } + + u32 Unmap(u32 used_size) override { + if (used_size == 0) return 0; + + glBindBuffer(m_target, m_buffer_id); + glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW); + return 0; + } + + u32 GetChunkSize() const override { return m_size; } + + static std::unique_ptr Create(GLenum target, u32 size) { + glGetError(); + + GLuint buffer_id; + glGenBuffers(1, &buffer_id); + glBindBuffer(target, buffer_id); + glBufferData(target, size, nullptr, GL_STREAM_DRAW); + + GLenum err = glGetError(); + if (err != GL_NO_ERROR) { + glBindBuffer(target, 0); + glDeleteBuffers(1, &buffer_id); + return {}; + } + + return std::unique_ptr(new BufferDataStreamBuffer(target, buffer_id, size)); + } + + private: + BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) { + m_cpu_buffer = static_cast(Common::alignedMalloc(size, 32)); + if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer"); + } + + u8* m_cpu_buffer; + }; + + // Base class for implementations which require syncing. 
+ class SyncingStreamBuffer : public OpenGLStreamBuffer { + public: + enum : u32 { NUM_SYNC_POINTS = 16 }; + + virtual ~SyncingStreamBuffer() override { + for (u32 i = m_available_block_index; i <= m_used_block_index; i++) { + glDeleteSync(m_sync_objects[i]); + } + } + + protected: + SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size) + : OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {} + + ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; } + + ALWAYS_INLINE void AddSyncsForOffset(u32 offset) { + const u32 end = GetSyncIndexForOffset(offset); + for (; m_used_block_index < end; m_used_block_index++) { + if (m_sync_objects[m_used_block_index]) { + Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use"); + } + + m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + } + } + + ALWAYS_INLINE void WaitForSync(GLsync& sync) { + glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED); + glDeleteSync(sync); + sync = nullptr; + } + + ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) { + const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); + for (; m_available_block_index < end; m_available_block_index++) { + if (!m_sync_objects[m_used_block_index]) [[unlikely]] { + Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use"); + } + + WaitForSync(m_sync_objects[m_available_block_index]); + } + } + + void AllocateSpace(u32 size) { + // add sync objects for writes since the last allocation + AddSyncsForOffset(m_position); + + // wait for sync objects for the space we want to use + EnsureSyncsWaitedForOffset(m_position + size); + + // wrap-around? + if ((m_position + size) > m_size) { + // current position ... 
buffer end + AddSyncsForOffset(m_size); + + // rewind, and try again + m_position = 0; + + // wait for the sync at the start of the buffer + WaitForSync(m_sync_objects[0]); + m_available_block_index = 1; + + // and however much more we need to satisfy the allocation + EnsureSyncsWaitedForOffset(size); + m_used_block_index = 0; + } + } + + u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; } + + u32 m_position = 0; + u32 m_used_block_index = 0; + u32 m_available_block_index = NUM_SYNC_POINTS; + u32 m_bytes_per_block; + std::array m_sync_objects{}; + }; + + class BufferStorageStreamBuffer : public SyncingStreamBuffer { + public: + ~BufferStorageStreamBuffer() override { + glBindBuffer(m_target, m_buffer_id); + glUnmapBuffer(m_target); + glBindBuffer(m_target, 0); + } + + MappingResult Map(u32 alignment, u32 min_size) override { + if (m_position > 0) m_position = Common::alignUp(m_position, alignment); + + AllocateSpace(min_size); + if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] { + Helpers::panic("GL stream buffer: Invalid size passed to Unmap"); + } + + const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position); + return MappingResult{static_cast(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment}; + } + + u32 Unmap(u32 used_size) override { + if ((m_position + used_size) > m_size) [[unlikely]] { + Helpers::panic("GL stream buffer: Invalid size passed to Unmap"); + } + + if (!m_coherent) { + if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) { + glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size); + } else { + Bind(); + glFlushMappedBufferRange(m_target, m_position, used_size); + } + } + + const u32 prev_position = m_position; + m_position += used_size; + return prev_position; + } + + static std::unique_ptr Create(GLenum target, u32 size, bool coherent = true) { + glGetError(); + + GLuint buffer_id; + 
glGenBuffers(1, &buffer_id); + glBindBuffer(target, buffer_id); + + const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0); + const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT); + if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage) + glBufferStorage(target, size, nullptr, flags); + else if (GLAD_GL_EXT_buffer_storage) + glBufferStorageEXT(target, size, nullptr, flags); + + GLenum err = glGetError(); + if (err != GL_NO_ERROR) { + glBindBuffer(target, 0); + glDeleteBuffers(1, &buffer_id); + return {}; + } + + u8* mapped_ptr = static_cast(glMapBufferRange(target, 0, size, map_flags)); + AssertMsg(mapped_ptr, "Persistent buffer was mapped"); + + return std::unique_ptr(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent)); + } + + private: + BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent) + : SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {} + + u8* m_mapped_ptr; + bool m_coherent; + }; + +} // namespace + +std::unique_ptr OpenGLStreamBuffer::Create(GLenum target, u32 size) { + std::unique_ptr buf; + if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) { + buf = BufferStorageStreamBuffer::Create(target, size); + if (buf) return buf; + } + + // BufferSubData is slower on all drivers except NVIDIA... +#if 0 + const char* vendor = reinterpret_cast(glGetString(GL_VENDOR)); + if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) { + // Mali and Adreno drivers can't do sub-buffer tracking... 
+ return BufferDataStreamBuffer::Create(target, size); + } + + return BufferSubDataStreamBuffer::Create(target, size); +#else + return BufferDataStreamBuffer::Create(target, size); +#endif +} \ No newline at end of file diff --git a/third_party/duckstation/gl/stream_buffer.h b/third_party/duckstation/gl/stream_buffer.h new file mode 100644 index 000000000..6b3562e78 --- /dev/null +++ b/third_party/duckstation/gl/stream_buffer.h @@ -0,0 +1,53 @@ +// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin +// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0) + +#pragma once + +#include +// Comment to avoid clang-format reordering the glad header + +#include +#include +#include +#include + +#include "duckstation_compat.h" +#include "helpers.hpp" + +class OpenGLStreamBuffer { + public: + virtual ~OpenGLStreamBuffer(); + + ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; } + ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; } + ALWAYS_INLINE u32 GetSize() const { return m_size; } + + void Bind(); + void Unbind(); + + void SetDebugName(std::string_view name); + + struct MappingResult { + void* pointer; + u32 buffer_offset; + u32 index_aligned; // offset / alignment, suitable for base vertex + u32 space_aligned; // remaining space / alignment + }; + + virtual MappingResult Map(u32 alignment, u32 min_size) = 0; + + /// Returns the position in the buffer *before* the start of used_size. + virtual u32 Unmap(u32 used_size) = 0; + + /// Returns the minimum granularity of blocks which sync objects will be created around. 
+ virtual u32 GetChunkSize() const = 0; + + static std::unique_ptr Create(GLenum target, u32 size); + + protected: + OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size); + + GLenum m_target; + GLuint m_buffer_id; + u32 m_size; +}; \ No newline at end of file From f96b609123cda765397c6015bd0450603e6d37a1 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 03:49:07 +0300 Subject: [PATCH 38/63] GL: Actually upload data to stream buffers --- include/PICA/draw_acceleration.hpp | 4 ++- include/renderer_gl/gl_state.hpp | 9 ------- src/core/PICA/draw_acceleration.cpp | 19 ++++++++------ src/core/renderer_gl/gl_state.cpp | 3 --- src/core/renderer_gl/renderer_gl.cpp | 38 +++++++++++++++++++++++----- 5 files changed, 46 insertions(+), 27 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index bd3e428dd..2ec3f318a 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -11,15 +11,16 @@ namespace PICA { struct AttributeInfo { u8* data; u32 offset; + u32 size; u8 type; u8 componentCount; bool fixed; + bool isPadding; std::array fixedValue; // For fixed attributes }; - u8* vertexBuffer; u8* indexBuffer; // Minimum and maximum index in the index buffer for a draw call @@ -31,5 +32,6 @@ namespace PICA { bool canBeAccelerated; bool indexed; + bool useShortIndices; }; } // namespace PICA \ No newline at end of file diff --git a/include/renderer_gl/gl_state.hpp b/include/renderer_gl/gl_state.hpp index e5591ea0f..4085cabcf 100644 --- a/include/renderer_gl/gl_state.hpp +++ b/include/renderer_gl/gl_state.hpp @@ -38,7 +38,6 @@ struct GLStateManager { GLuint stencilMask; GLuint boundVAO; - GLuint boundVBO; GLuint currentProgram; GLuint boundUBO; @@ -173,13 +172,6 @@ struct GLStateManager { } } - void bindVBO(GLuint handle) { - if (boundVBO != handle) { - boundVBO = handle; - glBindBuffer(GL_ARRAY_BUFFER, handle); - } - } - void 
useProgram(GLuint handle) { if (currentProgram != handle) { currentProgram = handle; @@ -195,7 +187,6 @@ struct GLStateManager { } void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); } - void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); } void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); } void setColourMask(bool r, bool g, bool b, bool a) { diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 5fc21e48a..22b1f0413 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -12,7 +12,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer - accel.vertexBuffer = getPointerPhys(vertexBase); if (indexed) { u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig]; u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff); @@ -22,11 +21,12 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { u16 maximumIndex = 0; // Check whether the index buffer uses u16 indices or u8 - bool shortIndex = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit + accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig); // Indicates whether vert indices are 16-bit or 8-bit // Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them - if (shortIndex) { + if (accel.useShortIndices) { u16* indexBuffer16 = reinterpret_cast(indexBuffer); + for (int i = 0; i < vertexCount; i++) { u16 index = indexBuffer16[i]; minimumIndex = std::min(minimumIndex, index); @@ -84,6 +84,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attributeOffset += (index - 11) << 2; attr.data = 
nullptr; + attr.isPadding = true; continue; } @@ -91,18 +92,19 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) const u32 size = (attribInfo >> 2) + 1; // Total number of components + // Size of each component based on the attribute type + static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + attr.componentCount = size; attr.offset = attributeOffset; + attr.size = size * sizePerComponent[attribType]; attr.type = attribType; + attr.isPadding = false; + attributeOffset += attr.size; // Get a pointer to the data where this attribute is stored const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size); attr.data = getPointerPhys(attrAddress); - - // Size of each component based on the attribute type - static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; - attributeOffset += size * sizePerComponent[attribType]; - attrCount += 1; } @@ -114,6 +116,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.fixed = true; // Set the data pointer to nullptr in order to catch any potential bugs attr.data = nullptr; + attr.isPadding = false; for (int i = 0; i < 4; i++) { attr.fixedValue[i] = fixedAttr[i].toFloat32(); diff --git a/src/core/renderer_gl/gl_state.cpp b/src/core/renderer_gl/gl_state.cpp index 3d1c0681a..785cac411 100644 --- a/src/core/renderer_gl/gl_state.cpp +++ b/src/core/renderer_gl/gl_state.cpp @@ -73,10 +73,7 @@ void GLStateManager::resetVAO() { } void GLStateManager::resetBuffers() { - boundVBO = 0; boundUBO = 0; - - glBindBuffer(GL_ARRAY_BUFFER, 0); glBindBuffer(GL_UNIFORM_BUFFER, 0); } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 3b2d1d70b..4ed1eac1e 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -97,7 +97,7 @@ void RendererGL::initGraphicsContextInternal() { 
glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW); vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW); - gl.bindVBO(vbo); + vbo.bind(); // Initialize the VAO used when not using hw shaders defaultVAO.create(); gl.bindVAO(defaultVAO); @@ -439,7 +439,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); - gl.bindVBO(vbo); + vbo.bind(); gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO); gl.enableClipPlane(0); // Clipping plane 0 is always enabled @@ -1135,11 +1135,37 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele GL_FLOAT, // 3: Float }; + const u32 vertexCount = accel->maximumIndex - accel->minimumIndex + 1; + + // Update index buffer if necessary + if (accel->indexed) { + const bool shortIndex = accel->useShortIndices; + const usize indexBufferSize = usize(vertexCount) * (shortIndex ? sizeof(u16) : sizeof(u8)); + + auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize); + std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize); + hwIndexBuffer->Unmap(indexBufferSize); + } + + auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); + u8* vertexData = static_cast(vertexBufferRes.pointer); + for (int i = 0; i < totalAttribCount; i++) { const auto& attrib = accel->attributeInfo[i]; - printf( - "%s attribute starting from offset %d with a size of %d components\n", attrib.fixed ? "Fixed" : "Variable", (!attrib.fixed) ? attrib.offset : 0, - !attrib.fixed ? 
attrib.componentCount : 4 - ); + + if (attrib.fixed) { + Helpers::panic("Fixed attribute!"); + } else { + if (attrib.isPadding) { + continue; + } + + const u32 attributeSize = attrib.size * vertexCount; + + std::memcpy(vertexData, attrib.data, attributeSize); + vertexData += attributeSize; + } } + + hwVertexBuffer->Unmap(accel->vertexDataSize); } \ No newline at end of file From 33e63f7d7ac826066ec8a8bfea2ed9021f29c8c2 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 16:02:54 +0300 Subject: [PATCH 39/63] GPU: Cleanup immediate mode handling --- include/renderer.hpp | 2 +- include/renderer_gl/renderer_gl.hpp | 2 +- src/core/PICA/gpu.cpp | 2 +- src/core/PICA/regs.cpp | 2 +- src/core/renderer_gl/renderer_gl.cpp | 9 +++++---- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/renderer.hpp b/include/renderer.hpp index 94a0b0f3c..5a1efc773 100644 --- a/include/renderer.hpp +++ b/include/renderer.hpp @@ -84,7 +84,7 @@ class Renderer { // It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between // ubershaders and shadergen, and so on. 
// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU - virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { return false; } + virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; } // Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window #ifdef PANDA3DS_FRONTEND_QT diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 63bbb474c..162864845 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -160,7 +160,7 @@ class RendererGL final : public Renderer { virtual bool supportsShaderReload() override { return true; } virtual std::string getUbershader() override; virtual void setUbershader(const std::string& shader) override; - virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) override; + virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override; std::optional getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true); diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 76f3acea1..64dc5beb7 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -131,7 +131,7 @@ void GPU::drawArrays(bool indexed) { getAcceleratedDrawInfo(accel, indexed); } - const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel, false); + const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel); if (hwShaders) { if (indexed) { diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp index 091bd377d..4c865d122 100644 --- a/src/core/PICA/regs.cpp +++ b/src/core/PICA/regs.cpp @@ -249,7 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) { // If we've reached 3 verts, issue a draw call // Handle rendering 
depending on the primitive type if (immediateModeVertIndex == 3) { - renderer->prepareForDraw(shaderUnit, nullptr, true); + renderer->prepareForDraw(shaderUnit, nullptr); renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices); switch (primType) { diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 4ed1eac1e..fc6e2ce6b 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -950,7 +950,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() { return program; } -bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { +bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { // First we figure out if we will be using an ubershader bool usingUbershader = emulatorConfig->useUbershaders; if (usingUbershader) { @@ -966,7 +966,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* // Then we figure out if we will use hw accelerated shaders, and try to fetch our shader // TODO: Ubershader support for accelerated shaders - usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader; + usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated; if (usingAcceleratedShader) { PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader); @@ -1000,9 +1000,10 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* shaderUnit.vs.uniformsDirty = false; glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); } - } - accelerateVertexUpload(shaderUnit, accel); + // Upload vertex data and index buffer data to our GPU + accelerateVertexUpload(shaderUnit, accel); + } } if (usingUbershader) { From 5432a5a0d87ed17a81b7ac865f8b06413b893821 Mon Sep 17 00:00:00 2001 From: wheremyfoodat 
<44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 17:14:19 +0300 Subject: [PATCH 40/63] Get first renders working with accelerated draws --- include/PICA/draw_acceleration.hpp | 1 + include/renderer_gl/renderer_gl.hpp | 1 + src/core/PICA/draw_acceleration.cpp | 1 + src/core/PICA/gpu.cpp | 96 +++++++------------- src/core/renderer_gl/renderer_gl.cpp | 38 +++++--- third_party/duckstation/gl/stream_buffer.cpp | 4 +- 6 files changed, 63 insertions(+), 78 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 2ec3f318a..1671825ea 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -12,6 +12,7 @@ namespace PICA { u8* data; u32 offset; u32 size; + u32 stride; u8 type; u8 componentCount; diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 162864845..b643534a6 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -62,6 +62,7 @@ class RendererGL final : public Renderer { bool oldDepthmapEnable = false; // Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader bool usingAcceleratedShader = false; + bool performIndexedRender = false; // Cached pointer to the current vertex shader when using HW accelerated shaders OpenGL::Shader* generatedVertexShader = nullptr; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 22b1f0413..7646577fc 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.componentCount = size; attr.offset = attributeOffset; attr.size = size * sizePerComponent[attribType]; + attr.stride = attrData.size; attr.type = attribType; attr.isPadding = false; attributeOffset += attr.size; diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 64dc5beb7..dad24a22a 
100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -120,6 +120,8 @@ void GPU::reset() { renderer->reset(); } +static std::array vertices; + // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter) // And whether we are going to use the shader JIT (second template parameter) void GPU::drawArrays(bool indexed) { @@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) { const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel); if (hwShaders) { - if (indexed) { - drawArrays(); - } else { - drawArrays(); - } + // Hardware shaders have their own accelerated code path for draws, so they skip everything here + const PICA::PrimType primType = static_cast(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig])); + // Total # of vertices to render + const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; + + // Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching + renderer->drawVertices(primType, std::span(vertices).first(vertexCount)); } else { const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; @@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) { } } -// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer, -// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist -// of 16 vec4 attributes -union PICAVertexBuffer { - // Used with CPU shaders - std::array vertices; - // Used with GPU shaders. 
We can have up to 16 attributes per vertex, each attribute with 4 floats - std::array vsInputs; - - PICAVertexBuffer() {} -}; - -static PICAVertexBuffer vertexBuffer; - template void GPU::drawArrays() { if constexpr (mode == ShaderExecMode::JIT) { shaderJIT.prepare(shaderUnit.vs); + } else if constexpr (mode == ShaderExecMode::Hardware) { + // Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path + Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!"); } // We can have up to 16 attributes, each one consisting of 4 floats constexpr u32 maxAttrSizeInFloats = 16 * 4; - auto& vertices = vertexBuffer.vertices; - - if constexpr (mode != ShaderExecMode::Hardware) { - setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]); - } // Base address for vertex attributes // The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible @@ -257,15 +245,7 @@ void GPU::drawArrays() { size_t tag = vertexIndex % vertexCacheSize; // Cache hit if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) { - if constexpr (mode != ShaderExecMode::Hardware) { - vertices[i] = vertices[cache.bufferPositions[tag]]; - } else { - const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats; - std::memcpy( - &vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition], - sizeof(float) * maxAttrSizeInFloats - ); - } + vertices[i] = vertices[cache.bufferPositions[tag]]; continue; } @@ -370,39 +350,29 @@ void GPU::drawArrays() { } } - // Running shader on the CPU instead of the GPU - if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) { - // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers - // Based on the SH_ATTRIBUTES_PERMUTATION registers. 
- // Ie it might map attribute #0 to v2, #1 to v7, etc - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - std::memcpy(&shaderUnit.vs.inputs[mapping], ¤tAttributes[j], sizeof(vec4f)); - } + // Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers + // Based on the SH_ATTRIBUTES_PERMUTATION registers. + // Ie it might map attribute #0 to v2, #1 to v7, etc + for (int j = 0; j < totalAttribCount; j++) { + const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; + std::memcpy(&shaderUnit.vs.inputs[mapping], ¤tAttributes[j], sizeof(vec4f)); + } - if constexpr (mode == ShaderExecMode::JIT) { - shaderJIT.run(shaderUnit.vs); - } else { - shaderUnit.vs.run(); - } + if constexpr (mode == ShaderExecMode::JIT) { + shaderJIT.run(shaderUnit.vs); + } else { + shaderUnit.vs.run(); + } - PICA::Vertex& out = vertices[i]; - // Map shader outputs to fixed function properties - const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; - for (int i = 0; i < totalShaderOutputs; i++) { - const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; + PICA::Vertex& out = vertices[i]; + // Map shader outputs to fixed function properties + const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7; + for (int i = 0; i < totalShaderOutputs; i++) { + const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i]; - for (int j = 0; j < 4; j++) { // pls unroll - const u32 mapping = (config >> (j * 8)) & 0x1F; - out.raw[mapping] = vsOutputRegisters[i][j]; - } - } - } else { // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly - float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats]; - for (int j = 0; j < totalAttribCount; j++) { - const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf; - // Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats - 
std::memcpy(&out[mapping * 4], ¤tAttributes[j], sizeof(vec4f)); + for (int j = 0; j < 4; j++) { // pls unroll + const u32 mapping = (config >> (j * 8)) & 0x1F; + out.raw[mapping] = vsOutputRegisters[i][j]; } } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index fc6e2ce6b..82248d535 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() { // Initialize the VAO used for hw shaders hwShaderVAO.create(); - gl.bindVAO(hwShaderVAO); - for (int attr = 0; attr < 16; attr++) { - hwShaderVAO.setAttributeFloat(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4); - hwShaderVAO.enableAttribute(attr); - } dummyVBO.create(); dummyVAO.create(); @@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); - vbo.bind(); - gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO); + + if (usingAcceleratedShader) { + hwVertexBuffer->Bind(); + gl.bindVAO(hwShaderVAO); + } else { + vbo.bind(); + gl.bindVAO(defaultVAO); + } gl.enableClipPlane(0); // Clipping plane 0 is always enabled if (regs[PICA::InternalRegs::ClipEnable] & 1) { @@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v setupStencilTest(stencilEnable); - // If we're using hardware shaders, the vertex array works completely different - // And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability. 
if (!usingAcceleratedShader) { vbo.bufferVertsSub(vertices); + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { - glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data()); + if (performIndexedRender) { + // When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + hwIndexBuffer->Bind(); + //glDrawRangeElementsBaseVertex(); + } else { + // When doing non-indexed rendering, just use glDrawArrays + OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); + } } - - OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } void RendererGL::display() { @@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* // Upload vertex data and index buffer data to our GPU accelerateVertexUpload(shaderUnit, accel); + performIndexedRender = accel->indexed; } } @@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele } auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); + u8* vertexData = static_cast(vertexBufferRes.pointer); + gl.bindVAO(hwShaderVAO); for (int i = 0; i < totalAttribCount; i++) { const auto& attrib = accel->attributeInfo[i]; @@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele continue; } - const u32 attributeSize = attrib.size * vertexCount; + glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast(vertexBufferRes.buffer_offset + attrib.offset)); + // TODO: Disable unused attributes as well + hwShaderVAO.enableAttribute(i); + const u32 attributeSize = attrib.size * vertexCount; std::memcpy(vertexData, attrib.data, attributeSize); + vertexData += attributeSize; } } diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index f4f8b54cf..ff6c79f9b 100644 --- 
a/third_party/duckstation/gl/stream_buffer.cpp +++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -132,7 +132,7 @@ namespace { const u32 end = GetSyncIndexForOffset(offset); for (; m_used_block_index < end; m_used_block_index++) { if (m_sync_objects[m_used_block_index]) { - Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use"); } m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); @@ -149,7 +149,7 @@ namespace { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { if (!m_sync_objects[m_used_block_index]) [[unlikely]] { - Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use"); } WaitForSync(m_sync_objects[m_available_block_index]); From e925a91e405545c22dc13d5c326d2fdccf17f72c Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 18:38:22 +0300 Subject: [PATCH 41/63] Shader decompiler: Fix control flow analysis bugs --- src/core/PICA/shader_decompiler.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 133637a7b..75de4e504 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -18,7 +18,7 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) { analysisFailed = false; const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount); - if (function == nullptr) { + if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) { analysisFailed = true; } } @@ -83,6 +83,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, 
u32 e it->second = exitParallel(branchTakenExit, branchNotTakenExit); return it->second; } + case ShaderOpcodes::IFU: case ShaderOpcodes::IFC: { const u32 num = instruction & 0xff; @@ -114,7 +115,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e it->second = parallel; return it->second; } else { - ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels); + ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels); ExitMode conditionalExitMode = exitSeries(parallel, afterConditional); it->second = conditionalExitMode; return it->second; @@ -139,7 +140,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e // Exit mode of the remainder of this function, after we return from the callee const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels); - const ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode); + const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode); it->second = exitMode; return exitMode; @@ -179,7 +180,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels); - const ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode); + const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop); it->second = exitMode; return it->second; } @@ -190,7 +191,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } // A function without control flow instructions will always reach its "return point" and return - return ExitMode::AlwaysReturn; + it->second = ExitMode::AlwaysReturn; + return it->second; } std::pair ShaderDecompiler::compileRange(const AddressRange& range) { From 37a43e245f2e901d46c8cacf948d8909c1d343a5 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:13:37 +0300 
Subject: [PATCH 42/63] HW shaders: Accelerate indexed draws --- include/renderer_gl/renderer_gl.hpp | 8 +++++++- src/core/PICA/gpu.cpp | 2 +- src/core/renderer_gl/renderer_gl.cpp | 26 +++++++++++++++++--------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index b643534a6..30b170266 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -60,9 +60,15 @@ class RendererGL final : public Renderer { float oldDepthScale = -1.0; float oldDepthOffset = 0.0; bool oldDepthmapEnable = false; - // Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader + // Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader bool usingAcceleratedShader = false; bool performIndexedRender = false; + bool usingShortIndices = false; + + // Set by prepareForDraw, metadata for indexed renders + GLuint minimumIndex = 0; + GLuint maximumIndex = 0; + void* hwIndexBufferOffset = nullptr; // Cached pointer to the current vertex shader when using HW accelerated shaders OpenGL::Shader* generatedVertexShader = nullptr; diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index dad24a22a..2797e09fb 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -141,7 +141,7 @@ void GPU::drawArrays(bool indexed) { // Total # of vertices to render const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; - // Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching + // Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching renderer->drawVertices(primType, std::span(vertices).first(vertexCount)); } else { const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled; diff --git a/src/core/renderer_gl/renderer_gl.cpp 
b/src/core/renderer_gl/renderer_gl.cpp index 82248d535..536cb6fad 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -435,10 +435,8 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v const auto primitiveTopology = primTypes[static_cast(primType)]; gl.disableScissor(); - if (usingAcceleratedShader) { - hwVertexBuffer->Bind(); - gl.bindVAO(hwShaderVAO); - } else { + // If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw + if (!usingAcceleratedShader) { vbo.bind(); gl.bindVAO(defaultVAO); } @@ -509,9 +507,12 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { if (performIndexedRender) { - // When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + // When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw hwIndexBuffer->Bind(); - //glDrawRangeElementsBaseVertex(); + glDrawRangeElementsBaseVertex( + primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? 
GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, + hwIndexBufferOffset, -minimumIndex + ); } else { // When doing non-indexed rendering, just use glDrawArrays OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); @@ -1008,7 +1009,10 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* // Upload vertex data and index buffer data to our GPU accelerateVertexUpload(shaderUnit, accel); + performIndexedRender = accel->indexed; + minimumIndex = GLsizei(accel->minimumIndex); + maximumIndex = GLsizei(accel->maximumIndex); } } @@ -1146,17 +1150,21 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele // Update index buffer if necessary if (accel->indexed) { - const bool shortIndex = accel->useShortIndices; - const usize indexBufferSize = usize(vertexCount) * (shortIndex ? sizeof(u16) : sizeof(u8)); + usingShortIndices = accel->useShortIndices; + const usize indexBufferSize = usize(vertexCount) * (usingShortIndices ? sizeof(u16) : sizeof(u8)); + hwIndexBuffer->Bind(); auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize); + hwIndexBufferOffset = reinterpret_cast(usize(indexBufferRes.buffer_offset)); + std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize); hwIndexBuffer->Unmap(indexBufferSize); } + hwVertexBuffer->Bind(); auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); - u8* vertexData = static_cast(vertexBufferRes.pointer); + gl.bindVAO(hwShaderVAO); for (int i = 0; i < totalAttribCount; i++) { From ca2d7e40eaab6f5278d6b70aa1f6fbae2f308aa7 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:34:56 +0300 Subject: [PATCH 43/63] Shader decompiler: Add support for compilation errors --- include/PICA/shader_decompiler.hpp | 1 + src/core/PICA/shader_decompiler.cpp | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp 
index b7bd869c3..4a5cdc138 100644 --- a/include/PICA/shader_decompiler.hpp +++ b/include/PICA/shader_decompiler.hpp @@ -99,6 +99,7 @@ namespace PICA::ShaderGen { API api; Language language; + bool compilationError = false; void compileInstruction(u32& pc, bool& finished); // Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 75de4e504..2d4d29632 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -247,6 +247,7 @@ std::string ShaderDecompiler::decompile() { return ""; } + compilationError = false; decompiledShader = ""; switch (api) { @@ -324,6 +325,13 @@ std::string ShaderDecompiler::decompile() { } } + // We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction + // or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string + // and the renderer core will decide to use CPU shaders instead + if (compilationError) [[unlikely]] { + return ""; + } + return decompiledShader; } @@ -707,7 +715,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { return; case ShaderOpcodes::NOP: break; - default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break; + + default: + Helpers::warn("GLSL recompiler: Unknown opcode: %X. 
Falling back to CPU shaders", opcode); + compilationError = true; + break; } } From 0c2ae1b7d07df911ded44b581ea6125dcada7a0c Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 25 Aug 2024 19:55:47 +0300 Subject: [PATCH 44/63] GLSL decompiler: Fall back for LITP --- src/core/PICA/shader_decompiler.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 2d4d29632..347df5c5e 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -546,7 +546,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { break; } - default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break; + default: + Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode); + compilationError = true; + break; } } else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f]; @@ -717,7 +720,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { case ShaderOpcodes::NOP: break; default: - Helpers::warn("GLSL recompiler: Unknown opcode: %X. Falling back to CPU shaders", opcode); + Helpers::warn("GLSL recompiler: Unknown opcode: %02X. 
Falling back to CPU shaders", opcode); compilationError = true; break; } From 0e7697dc673c4c58a01fba0a74bdb4c941002292 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Mon, 26 Aug 2024 00:43:36 +0300 Subject: [PATCH 45/63] Add Renderdoc scope classes --- include/renderdoc.hpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/include/renderdoc.hpp b/include/renderdoc.hpp index 94a0f4944..ea2c8a3d6 100644 --- a/include/renderdoc.hpp +++ b/include/renderdoc.hpp @@ -35,4 +35,35 @@ namespace Renderdoc { static void setOutputDir(const std::string& path, const std::string& prefix) {} static constexpr bool isSupported() { return false; } } // namespace Renderdoc -#endif \ No newline at end of file +#endif + +namespace Renderdoc { + // RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture + struct Scope { + Scope() { Renderdoc::startCapture(); } + ~Scope() { Renderdoc::endCapture(); } + + Scope(const Scope&) = delete; + Scope& operator=(const Scope&) = delete; + + Scope(Scope&&) = delete; + Scope& operator=(const Scope&&) = delete; + }; + + // RAII scope class that encloses a Renderdoc capture. 
Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture + // trigger on its own and take a capture + struct InstantScope { + InstantScope() { + Renderdoc::triggerCapture(); + Renderdoc::startCapture(); + } + + ~InstantScope() { Renderdoc::endCapture(); } + + InstantScope(const InstantScope&) = delete; + InstantScope& operator=(const InstantScope&) = delete; + + InstantScope(InstantScope&&) = delete; + InstantScope& operator=(const InstantScope&&) = delete; + }; +} // namespace Renderdoc \ No newline at end of file From e332ab2e58b7ce42e245ca765bbdc7950ace1dd7 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Tue, 3 Sep 2024 01:59:58 +0300 Subject: [PATCH 46/63] Fix control flow analysis bug --- include/PICA/draw_acceleration.hpp | 2 ++ include/renderer_gl/renderer_gl.hpp | 3 ++ src/core/PICA/draw_acceleration.cpp | 12 +++++++- src/core/PICA/shader_decompiler.cpp | 3 +- src/core/PICA/shader_gen_glsl.cpp | 2 -- src/core/renderer_gl/renderer_gl.cpp | 46 ++++++++++++++++++++++------ 6 files changed, 54 insertions(+), 14 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 1671825ea..72eb8944c 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -14,6 +14,7 @@ namespace PICA { u32 size; u32 stride; + u8 inputReg; // Which input reg should this attribute go to in the vertex shader? 
u8 type; u8 componentCount; bool fixed; @@ -27,6 +28,7 @@ namespace PICA { // Minimum and maximum index in the index buffer for a draw call u16 minimumIndex, maximumIndex; u32 totalAttribCount; + u32 enabledAttributeMask; u32 vertexDataSize; std::array attributeInfo; diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 30b170266..137c48898 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -70,6 +70,9 @@ class RendererGL final : public Renderer { GLuint maximumIndex = 0; void* hwIndexBufferOffset = nullptr; + // When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw + u32 previousAttributeMask = 0; + // Cached pointer to the current vertex shader when using HW accelerated shaders OpenGL::Shader* generatedVertexShader = nullptr; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 7646577fc..538a714eb 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -8,7 +8,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.indexed = indexed; accel.totalAttribCount = totalAttribCount; - + accel.enabledAttributeMask = 0; + const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16; const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg]; // Total # of vertices to transfer @@ -50,6 +51,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { } const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); + const u64 inputAttrCfg = getVertexShaderInputConfig(); + u32 buffer = 0; u32 attrCount = 0; accel.vertexDataSize = 0; @@ -94,7 +97,11 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Size of each component based on the attribute 
type static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; + // Mark the attribute as enabled + accel.enabledAttributeMask |= 1 << inputReg; + attr.inputReg = inputReg; attr.componentCount = size; attr.offset = attributeOffset; attr.size = size * sizePerComponent[attribType]; @@ -123,6 +130,9 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.fixedValue[i] = fixedAttr[i].toFloat32(); } + const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; + + attr.inputReg = inputReg; attrCount += 1; } } diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 347df5c5e..ead984100 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -79,7 +79,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e // This opens up 2 parallel paths of execution auto branchTakenExit = analyzeFunction(shader, dest, end, labels); - auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels); + auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels); it->second = exitParallel(branchTakenExit, branchNotTakenExit); return it->second; } @@ -122,6 +122,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e } break; } + case ShaderOpcodes::CALL: { const u32 num = instruction & 0xff; const u32 dest = getBits<10, 12>(instruction); diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp index affe9837f..8fc2b126c 100644 --- a/src/core/PICA/shader_gen_glsl.cpp +++ b/src/core/PICA/shader_gen_glsl.cpp @@ -778,8 +778,6 @@ void main() { gl_ClipDistance[1] = dot(clipCoords, a_coords); #endif })"; - - std::cout << ret << "\n"; return ret; } } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 536cb6fad..3d0119552 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ 
b/src/core/renderer_gl/renderer_gl.cpp @@ -2,6 +2,7 @@ #include +#include #include #include "PICA/float_types.hpp" @@ -987,7 +988,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL ); - // Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload + // Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload // it to the GPU if (!picaShaderSource.empty()) { std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader); @@ -1167,24 +1168,49 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele gl.bindVAO(hwShaderVAO); + // Enable or disable vertex attributes as needed + const u32 currentAttributeMask = accel->enabledAttributeMask; + // Use bitwise xor to calculate which attributes chanced + u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask; + + while (attributeMaskDiff != 0) { + // Get index of next different attribute and turn it off + const u32 index = 31 - std::countl_zero(attributeMaskDiff); + const u32 mask = 1u << index; + attributeMaskDiff ^= mask; + + if ((currentAttributeMask & mask) != 0) { + // Attribute was disabled and is now enabled + hwShaderVAO.enableAttribute(index); + } else { + // Attribute was enabled and is now disabled + hwShaderVAO.disableAttribute(index); + } + } + + previousAttributeMask = currentAttributeMask; + for (int i = 0; i < totalAttribCount; i++) { const auto& attrib = accel->attributeInfo[i]; - + if (attrib.fixed) { - Helpers::panic("Fixed attribute!"); + if ((currentAttributeMask & (1u << i)) == 0) { + glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], 
attrib.fixedValue[3]); + } } else { - if (attrib.isPadding) { + if (attrib.isPadding) [[unlikely]] { continue; } - - glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast(vertexBufferRes.buffer_offset + attrib.offset)); - // TODO: Disable unused attributes as well - hwShaderVAO.enableAttribute(i); - + const u32 attributeSize = attrib.size * vertexCount; std::memcpy(vertexData, attrib.data, attributeSize); - + vertexData += attributeSize; + + glVertexAttribPointer( + attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, + reinterpret_cast(vertexBufferRes.buffer_offset + attrib.offset) + ); } } From 15b6a9e2d947e46a192041dbe860e2e502eac619 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Tue, 3 Sep 2024 02:21:20 +0300 Subject: [PATCH 47/63] HW shaders: Fix attribute fetch --- src/core/PICA/draw_acceleration.cpp | 9 +++++---- src/core/renderer_gl/renderer_gl.cpp | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 538a714eb..84096fb74 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -64,7 +64,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { if (!fixedAttrib) { auto& attrData = attributeInfo[buffer]; // Get information for this attribute u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) - u32 attributeOffset = attrData.offset; if (attrData.componentCount != 0) { // Size of the attribute in bytes multiplied by the total number of vertices @@ -73,6 +72,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { accel.vertexDataSize += (bytes + 3) & ~3; } + u32 attributeOffset = 0; for (int i = 0; i < attrData.componentCount; i++) { uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of 
attribute in vertexCfg auto& attr = accel.attributeInfo[attrCount]; @@ -101,6 +101,10 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Mark the attribute as enabled accel.enabledAttributeMask |= 1 << inputReg; + // Get a pointer to the data where this attribute is stored + const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size); + + attr.data = getPointerPhys(attrAddress); attr.inputReg = inputReg; attr.componentCount = size; attr.offset = attributeOffset; @@ -110,9 +114,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { attr.isPadding = false; attributeOffset += attr.size; - // Get a pointer to the data where this attribute is stored - const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size); - attr.data = getPointerPhys(attrAddress); attrCount += 1; } diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 3d0119552..6447f7635 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -512,7 +512,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v hwIndexBuffer->Bind(); glDrawRangeElementsBaseVertex( primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? 
GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, - hwIndexBufferOffset, -minimumIndex + hwIndexBufferOffset, -GLint(minimumIndex) ); } else { // When doing non-indexed rendering, just use glDrawArrays From 4a39b06262fb2b9b4fcc28293f23d82b7d4ff628 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 4 Sep 2024 03:18:39 +0300 Subject: [PATCH 48/63] Rewriting hw vertex fetch --- include/PICA/draw_acceleration.hpp | 17 ++- src/core/PICA/draw_acceleration.cpp | 145 ++++++++++--------- src/core/PICA/gpu.cpp | 2 - src/core/renderer_gl/renderer_gl.cpp | 42 +++--- third_party/duckstation/gl/stream_buffer.cpp | 2 +- 5 files changed, 110 insertions(+), 98 deletions(-) diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp index 72eb8944c..6a66cdc1f 100644 --- a/include/PICA/draw_acceleration.hpp +++ b/include/PICA/draw_acceleration.hpp @@ -6,32 +6,37 @@ namespace PICA { struct DrawAcceleration { - static constexpr u32 maxAttribCount = 12; + static constexpr u32 maxAttribCount = 16; + static constexpr u32 maxLoaderCount = 12; struct AttributeInfo { - u8* data; u32 offset; - u32 size; u32 stride; - u8 inputReg; // Which input reg should this attribute go to in the vertex shader? 
u8 type; u8 componentCount; - bool fixed; - bool isPadding; std::array fixedValue; // For fixed attributes }; + struct Loader { + // Data to upload for this loader + u8* data; + usize size; + }; + u8* indexBuffer; // Minimum and maximum index in the index buffer for a draw call u16 minimumIndex, maximumIndex; u32 totalAttribCount; + u32 totalLoaderCount; u32 enabledAttributeMask; + u32 fixedAttributes; u32 vertexDataSize; std::array attributeInfo; + std::array loaders; bool canBeAccelerated; bool indexed; diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index 84096fb74..a65fd1b54 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -1,5 +1,6 @@ #include "PICA/draw_acceleration.hpp" +#include #include #include "PICA/gpu.hpp" @@ -53,88 +54,94 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32); const u64 inputAttrCfg = getVertexShaderInputConfig(); - u32 buffer = 0; u32 attrCount = 0; + u32 loaderOffset = 0; accel.vertexDataSize = 0; + accel.totalLoaderCount = 0; - while (attrCount < totalAttribCount) { - bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0; + for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) { + auto& loaderData = attributeInfo[i]; // Get information for this attribute loader - // Variable attribute attribute - if (!fixedAttrib) { - auto& attrData = attributeInfo[buffer]; // Get information for this attribute - u64 attrCfg = attrData.getConfigFull(); // Get config1 | (config2 << 32) + // This loader is empty, skip it + if (loaderData.componentCount == 0 || loaderData.size == 0) { + continue; + } - if (attrData.componentCount != 0) { - // Size of the attribute in bytes multiplied by the total number of vertices - const u32 bytes = attrData.size * vertexCount; - // Add it to the total vertex data 
size, aligned to 4 bytes. - accel.vertexDataSize += (bytes + 3) & ~3; + auto& loader = accel.loaders[accel.totalLoaderCount++]; + + // The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading + // Which is equal to maximumIndex - minimumIndex + 1 + const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1); + loader.size = bytes; + + // Add it to the total vertex data size, aligned to 4 bytes. + accel.vertexDataSize += (bytes + 3) & ~3; + + // Get a pointer to the data where this loader's data is stored + const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size); + loader.data = getPointerPhys(loaderAddress); + + u64 attrCfg = loaderData.getConfigFull(); // Get config1 | (config2 << 32) + u32 attributeOffset = 0; + + for (int component = 0; component < loaderData.componentCount; component++) { + uint attributeIndex = (attrCfg >> (component * 4)) & 0xf; // Get index of attribute in vertexCfg + + // Vertex attributes used as padding + // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively + if (attributeIndex >= 12) [[unlikely]] { + Helpers::panic("Padding attribute"); + // Align attribute address up to a 4 byte boundary + attributeOffset = (attributeOffset + 3) & -4; + attributeOffset += (attributeIndex - 11) << 2; + continue; } - u32 attributeOffset = 0; - for (int i = 0; i < attrData.componentCount; i++) { - uint index = (attrCfg >> (i * 4)) & 0xf; // Get index of attribute in vertexCfg - auto& attr = accel.attributeInfo[attrCount]; - attr.fixed = false; - - // Vertex attributes used as padding - // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively - if (index >= 12) [[unlikely]] { - Helpers::panic("Padding attribute"); - // Align attribute address up to a 4 byte boundary - attributeOffset = (attributeOffset + 3) & -4; - attributeOffset += (index - 11) << 2; - - 
attr.data = nullptr; - attr.isPadding = true; - continue; - } - - const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf; - const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) - const u32 size = (attribInfo >> 2) + 1; // Total number of components - - // Size of each component based on the attribute type - static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; - const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; - // Mark the attribute as enabled - accel.enabledAttributeMask |= 1 << inputReg; - - // Get a pointer to the data where this attribute is stored - const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size); - - attr.data = getPointerPhys(attrAddress); - attr.inputReg = inputReg; - attr.componentCount = size; - attr.offset = attributeOffset; - attr.size = size * sizePerComponent[attribType]; - attr.stride = attrData.size; - attr.type = attribType; - attr.isPadding = false; - attributeOffset += attr.size; - - attrCount += 1; - } + const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf; + const u32 attribType = attribInfo & 0x3; // Type of attribute (sbyte/ubyte/short/float) + const u32 size = (attribInfo >> 2) + 1; // Total number of components + + // Size of each component based on the attribute type + static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4}; + const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf; + // Mark the attribute as enabled + accel.enabledAttributeMask |= 1 << inputReg; + + auto& attr = accel.attributeInfo[inputReg]; + attr.componentCount = size; + attr.offset = attributeOffset + loaderOffset; + attr.stride = loaderData.size; + attr.type = attribType; + attributeOffset += size * sizePerComponent[attribType]; + } - buffer += 1; - } else { - vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount]; - auto& attr = accel.attributeInfo[attrCount]; + loaderOffset += loader.size; + } + + u32 fixedAttributes = 
fixedAttribMask; + accel.fixedAttributes = 0; - attr.fixed = true; - // Set the data pointer to nullptr in order to catch any potential bugs - attr.data = nullptr; - attr.isPadding = false; + // Fetch values for all fixed attributes using CLZ on the fixed attribute mask to find the attributes that are actually fixed + while (fixedAttributes != 0) { + // Get index of next fixed attribute and turn it off + const u32 index = std::countr_zero(fixedAttributes); + const u32 mask = 1u << index; + fixedAttributes ^= mask; + + // PICA register this fixed attribute is meant to go to + const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf; + const u32 inputRegMask = 1u << inputReg; + + // If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute + if ((accel.enabledAttributeMask & inputRegMask) == 0) { + vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index]; + auto& attr = accel.attributeInfo[inputReg]; + + accel.fixedAttributes |= inputRegMask; for (int i = 0; i < 4; i++) { attr.fixedValue[i] = fixedAttr[i].toFloat32(); } - - const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf; - - attr.inputReg = inputReg; - attrCount += 1; } } diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp index 2797e09fb..2624903fc 100644 --- a/src/core/PICA/gpu.cpp +++ b/src/core/PICA/gpu.cpp @@ -337,8 +337,6 @@ void GPU::drawArrays() { } // Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else - // Corgi does this although I'm not sure if it's actually needed for anything. - // TODO: Find out while (component < 4) { attribute[component] = (component == 3) ? 
f24::fromFloat32(1.0) : f24::fromFloat32(0.0); component++; diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 6447f7635..954c30bc9 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -508,7 +508,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { if (performIndexedRender) { - // When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw + // When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw hwIndexBuffer->Bind(); glDrawRangeElementsBaseVertex( primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE, @@ -1165,12 +1165,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele hwVertexBuffer->Bind(); auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize); u8* vertexData = static_cast(vertexBufferRes.pointer); + const u32 vertexBufferOffset = vertexBufferRes.buffer_offset; gl.bindVAO(hwShaderVAO); // Enable or disable vertex attributes as needed const u32 currentAttributeMask = accel->enabledAttributeMask; - // Use bitwise xor to calculate which attributes chanced + // Use bitwise xor to calculate which attributes changed u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask; while (attributeMaskDiff != 0) { @@ -1190,29 +1191,30 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele previousAttributeMask = currentAttributeMask; - for (int i = 0; i < totalAttribCount; i++) { - const auto& attrib = accel->attributeInfo[i]; + // Upload the data for each (enabled) attribute loader into our vertex buffer + for (int i = 0; i < accel->totalLoaderCount; i++) { + auto& loader = accel->loaders[i]; - if (attrib.fixed) { - if ((currentAttributeMask & (1u << i)) 
== 0) { - glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); - } - } else { - if (attrib.isPadding) [[unlikely]] { - continue; - } - - const u32 attributeSize = attrib.size * vertexCount; - std::memcpy(vertexData, attrib.data, attributeSize); + std::memcpy(vertexData, loader.data, loader.size); + vertexData += loader.size; + } - vertexData += attributeSize; + hwVertexBuffer->Unmap(accel->vertexDataSize); + + // Iterate over the 16 PICA input registers and configure how they should be fetched. + for (int i = 0; i < 16; i++) { + const auto& attrib = accel->attributeInfo[i]; + const u32 attributeMask = 1u << i; + if (accel->fixedAttributes & attributeMask) { + // This is a fixed attribute, so set its fixed value + // TODO: Don't update these if the value does not change, it generates way too many calls + glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); + } else if (accel->enabledAttributeMask & attributeMask) { glVertexAttribPointer( - attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, - reinterpret_cast(vertexBufferRes.buffer_offset + attrib.offset) + i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, + reinterpret_cast(vertexBufferOffset + attrib.offset) ); } } - - hwVertexBuffer->Unmap(accel->vertexDataSize); } \ No newline at end of file diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index ff6c79f9b..6fff8b95e 100644 --- a/third_party/duckstation/gl/stream_buffer.cpp +++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -149,7 +149,7 @@ namespace { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { if (!m_sync_objects[m_used_block_index]) [[unlikely]] { - Helpers::warn("GL stream buffer: Fence slot we're trying 
to wait on in not in use"); + Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use"); } WaitForSync(m_sync_objects[m_available_block_index]); From 16425379e3c52d7e71be5c3df22b154c8f223153 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:14:55 +0300 Subject: [PATCH 49/63] Stream buffer: Fix copy-paste mistake --- third_party/duckstation/gl/stream_buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp index 6fff8b95e..b7a406036 100644 --- a/third_party/duckstation/gl/stream_buffer.cpp +++ b/third_party/duckstation/gl/stream_buffer.cpp @@ -148,7 +148,7 @@ namespace { ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) { const u32 end = std::min(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS); for (; m_available_block_index < end; m_available_block_index++) { - if (!m_sync_objects[m_used_block_index]) [[unlikely]] { + if (!m_sync_objects[m_available_block_index]) [[unlikely]] { Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use"); } From 09b04704f82a3ce483763cde94ef95fd91979834 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:38:43 +0300 Subject: [PATCH 50/63] HW shaders: Fix indexed rendering --- src/core/renderer_gl/renderer_gl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 954c30bc9..80d2ab415 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -1067,7 +1067,7 @@ void RendererGL::screenshot(const std::string& name) { // Flip the image vertically for (int y = 0; y < height; y++) { - memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4); + 
std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4); // Swap R and B channels for (int x = 0; x < width; x++) { std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]); @@ -1152,7 +1152,7 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele // Update index buffer if necessary if (accel->indexed) { usingShortIndices = accel->useShortIndices; - const usize indexBufferSize = usize(vertexCount) * (usingShortIndices ? sizeof(u16) : sizeof(u8)); + const usize indexBufferSize = regs[PICA::InternalRegs::VertexCountReg] * (usingShortIndices ? sizeof(u16) : sizeof(u8)); hwIndexBuffer->Bind(); auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize); From 0a2bc7c909f1a392253dad241dab86c196acbe40 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Fri, 4 Oct 2024 19:48:47 +0300 Subject: [PATCH 51/63] HW shaders: Add padding attributes --- src/core/PICA/draw_acceleration.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp index a65fd1b54..1850d8190 100644 --- a/src/core/PICA/draw_acceleration.cpp +++ b/src/core/PICA/draw_acceleration.cpp @@ -90,7 +90,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) { // Vertex attributes used as padding // 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively if (attributeIndex >= 12) [[unlikely]] { - Helpers::panic("Padding attribute"); // Align attribute address up to a 4 byte boundary attributeOffset = (attributeOffset + 3) & -4; attributeOffset += (attributeIndex - 11) << 2; From e3252ec4ef28c9563fd0fb735a63320dc3f1838b Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sat, 5 Oct 2024 21:14:40 +0300 Subject: [PATCH 52/63] HW shaders: Avoid redundant glVertexAttrib4f calls --- 
include/renderer_gl/renderer_gl.hpp | 3 +++ src/core/renderer_gl/renderer_gl.cpp | 22 ++++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp index 137c48898..738ce15a1 100644 --- a/include/renderer_gl/renderer_gl.hpp +++ b/include/renderer_gl/renderer_gl.hpp @@ -99,6 +99,9 @@ class RendererGL final : public Renderer { std::unique_ptr hwVertexBuffer; std::unique_ptr hwIndexBuffer; + // Cache of fixed attribute values so that we don't do any duplicate updates + std::array, 16> fixedAttrValues; + // Cached recompiled fragment shader struct CachedProgram { OpenGL::Program program; diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 80d2ab415..7e68e0c9a 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -184,6 +184,12 @@ void RendererGL::initGraphicsContextInternal() { OpenGL::clearColor(); OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]); + // Initialize fixed attributes + for (int i = 0; i < fixedAttrValues.size(); i++) { + fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f}; + glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0); + } + reset(); // Initialize the default vertex shader used with shadergen @@ -1008,12 +1014,12 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer()); } - // Upload vertex data and index buffer data to our GPU - accelerateVertexUpload(shaderUnit, accel); - performIndexedRender = accel->indexed; minimumIndex = GLsizei(accel->minimumIndex); maximumIndex = GLsizei(accel->maximumIndex); + + // Upload vertex data and index buffer data to our GPU + accelerateVertexUpload(shaderUnit, accel); } } @@ -1207,9 +1213,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele const u32 
attributeMask = 1u << i; if (accel->fixedAttributes & attributeMask) { - // This is a fixed attribute, so set its fixed value - // TODO: Don't update these if the value does not change, it generates way too many calls - glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); + auto& attrValue = fixedAttrValues[i]; + // This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated + if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] || + attrValue[3] != attrib.fixedValue[3]) { + std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue)); + glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]); + } } else if (accel->enabledAttributeMask & attributeMask) { glVertexAttribPointer( i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, From 872a6baccac2536a7eb3ef57f14cc5ea37312bac Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 15:39:13 +0300 Subject: [PATCH 53/63] HW shaders: Fix loops --- src/core/PICA/shader_decompiler.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index ead984100..aaa38a466 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -694,10 +694,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const u32 uniformIndex = getBits<22, 2>(instruction); // loop counter = uniform.y - decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 16u) & 0xFFu);\n", uniformIndex); + decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 8u) & 0xFFu);\n", uniformIndex); decompiledShader += fmt::format( - "for (uint loopCtr{} = 0u; loopCtr{} <= 
((uniform_int[{}] >> 24) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> " - "8u) & 0xFFu)) {{\n", + "for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 0) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> " + "16u) & 0xFFu)) {{\n", pc, pc, uniformIndex, pc, uniformIndex ); @@ -706,6 +706,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { callFunction(*func); decompiledShader += "}\n"; + // Jump to the end of the loop. We don't want to compile the code inside the loop again. + // This will be incremented by 1 due to the pc++ at the end of this loop. + pc = dest; + if (func->exitMode == ExitMode::AlwaysEnd) { finished = true; return; From bb7b1b3ef19def8fd00569d74c7058865ee4a42e Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 16:07:57 +0300 Subject: [PATCH 54/63] HW shaders: Make generated shaders slightly smaller --- src/core/PICA/shader_decompiler.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index aaa38a466..13a05f161 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -222,21 +222,21 @@ void ShaderDecompiler::writeAttributes() { decompiledShader += R"( layout(location = 0) in vec4 inputs[16]; layout(std140) uniform PICAShaderUniforms { - vec4 uniform_float[96]; - uvec4 uniform_int; + vec4 uniform_f[96]; + uvec4 uniform_i; uint uniform_bool; }; - vec4 tmp_regs[16]; + vec4 temp[16]; vec4 out_regs[16]; vec4 dummy_vec = vec4(0.0); ivec3 addr_reg = ivec3(0); bvec2 cmp_reg = bvec2(false); - vec4 float_uniform_indexed(int source, int offset) { + vec4 uniform_indexed(int source, int offset) { int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0; uint index = uint(clipped_offs + source) & 127u; - return (index < 96u) ? 
uniform_float[index] : vec4(1.0); + return (index < 96u) ? uniform_f[index] : vec4(1.0); } )"; } @@ -340,7 +340,7 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) if (source < 0x10) { return "inputs[" + std::to_string(source) + "]"; } else if (source < 0x20) { - return "tmp_regs[" + std::to_string(source - 0x10) + "]"; + return "temp[" + std::to_string(source - 0x10) + "]"; } else { const usize floatIndex = (source - 0x20) & 0x7f; @@ -348,10 +348,10 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) if (floatIndex >= 96) [[unlikely]] { return "dummy_vec"; } - return "uniform_float[" + std::to_string(floatIndex) + "]"; + return "uniform_f[" + std::to_string(floatIndex) + "]"; } else { static constexpr std::array offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"}; - return fmt::format("float_uniform_indexed({}, {})", floatIndex, offsets[index]); + return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]); } } } @@ -360,7 +360,7 @@ std::string ShaderDecompiler::getDest(u32 dest) const { if (dest < 0x10) { return "out_regs[" + std::to_string(dest) + "]"; } else if (dest < 0x20) { - return "tmp_regs[" + std::to_string(dest - 0x10) + "]"; + return "temp[" + std::to_string(dest - 0x10) + "]"; } else { return "dummy_vec"; } @@ -694,9 +694,9 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) { const u32 uniformIndex = getBits<22, 2>(instruction); // loop counter = uniform.y - decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 8u) & 0xFFu);\n", uniformIndex); + decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex); decompiledShader += fmt::format( - "for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 0) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> " + "for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> " "16u) & 0xFFu)) 
{{\n", pc, pc, uniformIndex, pc, uniformIndex ); From 53097cc53ec5e2ad5d0f3b5c40a5cf04da9789f9 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 17:01:27 +0300 Subject: [PATCH 55/63] Fix libretro build --- src/libretro_core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp index d77ee7260..21a62f230 100644 --- a/src/libretro_core.cpp +++ b/src/libretro_core.cpp @@ -198,7 +198,7 @@ static void configUpdate() { config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false); config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false); config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault); - config.accelerateShaders = FetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault); + config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault); config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true); config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8); From b833e071d153f8f03f10aaa4b886326ba10df8cc Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 17:13:51 +0300 Subject: [PATCH 56/63] Update config.hpp --- include/config.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/config.hpp b/include/config.hpp index da5b69408..a8ba89466 100644 --- a/include/config.hpp +++ b/include/config.hpp @@ -20,7 +20,7 @@ struct EmulatorConfig { #else static constexpr bool ubershaderDefault = true; #endif - static constexpr bool accelerateShadersDefault = false; + static constexpr bool accelerateShadersDefault = true; bool shaderJitEnabled = shaderJitDefault; bool useUbershaders = 
ubershaderDefault; From 12d081096a5cafbaabe31fd5b9c44faa983e6045 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 17:48:34 +0300 Subject: [PATCH 57/63] Update renderer_gl.cpp --- src/core/renderer_gl/renderer_gl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index fa0df0f16..641785272 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -510,6 +510,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span v setupStencilTest(stencilEnable); if (!usingAcceleratedShader) { + return; vbo.bufferVertsSub(vertices); OpenGL::draw(primitiveTopology, GLsizei(vertices.size())); } else { @@ -1227,4 +1228,4 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele ); } } -} \ No newline at end of file +} From 56c3e738adaa8f2d1f803956176b020f9e2ef4dc Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 18:54:03 +0300 Subject: [PATCH 58/63] Add Android logging when a shader fails to compile --- third_party/opengl/opengl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp index 607815fa0..b17ec8e3b 100644 --- a/third_party/opengl/opengl.hpp +++ b/third_party/opengl/opengl.hpp @@ -30,6 +30,7 @@ #include #include +#include #include // Check if we have C++20. 
If yes, we can add C++20 std::span support @@ -383,7 +384,7 @@ namespace OpenGL { if (success == GL_FALSE) { char buf[4096]; glGetShaderInfoLog(m_handle, 4096, nullptr, buf); - fprintf(stderr, "Failed to compile shader\nError: %s\n", buf); + __android_log_print("Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]); glDeleteShader(m_handle); m_handle = 0; From 40a7ac6d29ac639ee8ca443603f41926550aee9d Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 19:03:32 +0300 Subject: [PATCH 59/63] Update opengl.hpp --- third_party/opengl/opengl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp index b17ec8e3b..5ed630b2d 100644 --- a/third_party/opengl/opengl.hpp +++ b/third_party/opengl/opengl.hpp @@ -384,7 +384,7 @@ namespace OpenGL { if (success == GL_FALSE) { char buf[4096]; glGetShaderInfoLog(m_handle, 4096, nullptr, buf); - __android_log_print("Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]); + __android_log_print(ANDROID_LOG_INFO, "AlberDriver", "Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]); glDeleteShader(m_handle); m_handle = 0; From 5202d9172e48ab585e71978fc58614e2037c927d Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 19:23:02 +0300 Subject: [PATCH 60/63] Shader Decompiler: Add int/float precision qualifiers. 
--- src/core/PICA/shader_decompiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index 13a05f161..be05dd44b 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -253,7 +253,7 @@ std::string ShaderDecompiler::decompile() { switch (api) { case API::GL: decompiledShader += "#version 410 core\n"; break; - case API::GLES: decompiledShader += "#version 300 es\n"; break; + case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break; default: break; } From 214c1d8bed9b6b0b47f7c3c01ebfce0740ac9734 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 19:52:56 +0300 Subject: [PATCH 61/63] Update renderer_gl.cpp --- src/core/renderer_gl/renderer_gl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp index 641785272..c4ce4227c 100644 --- a/src/core/renderer_gl/renderer_gl.cpp +++ b/src/core/renderer_gl/renderer_gl.cpp @@ -992,7 +992,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* shader = OpenGL::Shader(); std::string picaShaderSource = PICA::ShaderGen::decompileShader( - shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL + shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, + Helpers::isAndroid() ? 
PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL ); // Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload From 0252a2a996bf9b210dd69e8098c6bd22780610ab Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Sun, 6 Oct 2024 20:34:01 +0300 Subject: [PATCH 62/63] Update shader_decompiler.cpp --- src/core/PICA/shader_decompiler.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp index be05dd44b..021a03a38 100644 --- a/src/core/PICA/shader_decompiler.cpp +++ b/src/core/PICA/shader_decompiler.cpp @@ -220,7 +220,23 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) { void ShaderDecompiler::writeAttributes() { decompiledShader += R"( - layout(location = 0) in vec4 inputs[16]; + layout(location = 0) in vec4 input0; + layout(location = 1) in vec4 input1; + layout(location = 2) in vec4 input2; + layout(location = 3) in vec4 input3; + layout(location = 4) in vec4 input4; + layout(location = 5) in vec4 input5; + layout(location = 6) in vec4 input6; + layout(location = 7) in vec4 input7; + layout(location = 8) in vec4 input8; + layout(location = 9) in vec4 input9; + layout(location = 10) in vec4 input10; + layout(location = 11) in vec4 input11; + layout(location = 12) in vec4 input12; + layout(location = 13) in vec4 input13; + layout(location = 14) in vec4 input14; + layout(location = 15) in vec4 input15; + layout(std140) uniform PICAShaderUniforms { vec4 uniform_f[96]; uvec4 uniform_i; @@ -338,7 +354,7 @@ std::string ShaderDecompiler::decompile() { std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const { if (source < 0x10) { - return "inputs[" + std::to_string(source) + "]"; + return "input" + std::to_string(source); } else if (source < 0x20) 
{ return "temp[" + std::to_string(source - 0x10) + "]"; } else { From aa181292fc59e034ff79561c578c5ccaa6fac769 Mon Sep 17 00:00:00 2001 From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com> Date: Wed, 9 Oct 2024 01:35:29 +0300 Subject: [PATCH 63/63] Update opengl.hpp --- third_party/opengl/opengl.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp index 5ed630b2d..01b93373e 100644 --- a/third_party/opengl/opengl.hpp +++ b/third_party/opengl/opengl.hpp @@ -30,7 +30,6 @@ #include #include -#include #include // Check if we have C++20. If yes, we can add C++20 std::span support @@ -384,7 +383,6 @@ namespace OpenGL { if (success == GL_FALSE) { char buf[4096]; glGetShaderInfoLog(m_handle, 4096, nullptr, buf); - __android_log_print(ANDROID_LOG_INFO, "AlberDriver", "Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]); glDeleteShader(m_handle); m_handle = 0;