From 90f411889ba271f747d7d230c461abbc2b970667 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 24 Jul 2024 23:23:24 +0300
Subject: [PATCH 01/63] Renderer: Add prepareForDraw callback

---
 include/PICA/shader_unit.hpp         |   7 +-
 include/renderer.hpp                 |   9 ++-
 include/renderer_gl/renderer_gl.hpp  |   4 +-
 src/core/PICA/gpu.cpp                |   2 +-
 src/core/PICA/regs.cpp               |   1 +
 src/core/PICA/shader_decompiler.cpp  |  22 +++---
 src/core/renderer_gl/renderer_gl.cpp | 109 +++++++++++++++------------
 7 files changed, 82 insertions(+), 72 deletions(-)
diff --git a/include/PICA/shader_unit.hpp b/include/PICA/shader_unit.hpp
index d8d931603..80e013468 100644
--- a/include/PICA/shader_unit.hpp
+++ b/include/PICA/shader_unit.hpp
@@ -2,10 +2,9 @@
 #include "PICA/shader.hpp"
 
 class ShaderUnit {
-
-public:
-	PICAShader vs; // Vertex shader
-	PICAShader gs; // Geometry shader
+  public:
+	PICAShader vs;  // Vertex shader
+	PICAShader gs;  // Geometry shader
 
 	ShaderUnit() : vs(ShaderType::Vertex), gs(ShaderType::Geometry) {}
 	void reset();
diff --git a/include/renderer.hpp b/include/renderer.hpp
index 569a730b7..1d1fb6824 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -21,9 +21,11 @@ enum class RendererType : s8 {
 };
 
 struct EmulatorConfig;
-class GPU;
 struct SDL_Window;
 
+class GPU;
+class ShaderUnit;
+
 class Renderer {
   protected:
 	GPU& gpu;
@@ -77,7 +79,10 @@ class Renderer {
 	virtual std::string getUbershader() { return ""; }
 	virtual void setUbershader(const std::string& shader) {}
 
-	virtual void setUbershaderSetting(bool value) {}
+	// This function is called on every draw call before parsing vertex data.
+	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
+	// ubershaders and shadergen, and so on.
+	virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {}
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index f5a964a34..6c18a0c63 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -30,7 +30,6 @@ class RendererGL final : public Renderer {
 
 	OpenGL::VertexArray vao;
 	OpenGL::VertexBuffer vbo;
-	bool enableUbershader = true;
 
 	// Data 
 	struct {
@@ -110,8 +109,7 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-
-	virtual void setUbershaderSetting(bool value) override { enableUbershader = value; }
+	virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index fe336edc8..b6d903e4a 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -117,13 +117,13 @@ void GPU::reset() {
 	externalRegs[Framebuffer1Config] = static_cast<u32>(PICA::ColorFmt::RGB8);
 	externalRegs[Framebuffer1Select] = 0;
 
-	renderer->setUbershaderSetting(config.useUbershaders);
 	renderer->reset();
 }
 
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
+	renderer->prepareForDraw(shaderUnit, false);
 	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
 
 	if (indexed) {
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index f805de60a..c9412fc8f 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -249,6 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
+							renderer->prepareForDraw(shaderUnit, true);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
 
 							switch (primType) {
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 482aa36ce..ce7d9a330 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -72,19 +72,17 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 
 void ShaderDecompiler::writeAttributes() {
 	decompiledShader += R"(
-		layout(location = 0) in vec4 inputs[8];
-
-		layout(std140) uniform PICAShaderUniforms {
-			vec4 uniform_float[96];
-			uvec4 uniform_int;
-			uint uniform_bool;
-		};
-	
-		vec4 temp_registers[16];
-		vec4 dummy_vec = vec4(0.0);
+	layout(location = 0) in vec4 inputs[8];
+	layout(std140) uniform PICAShaderUniforms {
+		vec4 uniform_float[96];
+		uvec4 uniform_int;
+		uint uniform_bool;
+	};
+
+	vec4 temp_registers[16];
+	vec4 output_registers[8];
+	vec4 dummy_vec = vec4(0.0);
 )";
-
-	decompiledShader += "\n";
 }
 
 std::string ShaderDecompiler::decompile() {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 8b614d2db..90eccf47a 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -4,11 +4,12 @@
 
 #include <cmrc/cmrc.hpp>
 
-#include "config.hpp"
 #include "PICA/float_types.hpp"
-#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/gpu.hpp"
+#include "PICA/pica_frag_uniforms.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader_decompiler.hpp"
+#include "config.hpp"
 #include "math_util.hpp"
 
 CMRC_DECLARE(RendererGL);
@@ -409,25 +410,6 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::Triangle,
 	};
 
-	bool usingUbershader = enableUbershader;
-	if (usingUbershader) {
-		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
-		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
-
-		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using moret han N lights via shadergen
-		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance 
-		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
-			usingUbershader = false;
-		}
-	}
-		
-	if (usingUbershader) {
-		gl.useProgram(triangleProgram);
-	} else {
-		OpenGL::Program& program = getSpecializedShader();
-		gl.useProgram(program);
-	}
-
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
 	gl.bindVBO(vbo);
@@ -449,38 +431,9 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const int depthFunc = getBits<4, 3>(depthControl);
 	const int colourMask = getBits<8, 4>(depthControl);
 	gl.setColourMask(colourMask & 1, colourMask & 2, colourMask & 4, colourMask & 8);
-
 	static constexpr std::array<GLenum, 8> depthModes = {GL_NEVER, GL_ALWAYS, GL_EQUAL, GL_NOTEQUAL, GL_LESS, GL_LEQUAL, GL_GREATER, GL_GEQUAL};
 
-	// Update ubershader uniforms
-	if (usingUbershader) {
-		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
-		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
-		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
-
-		if (oldDepthScale != depthScale) {
-			oldDepthScale = depthScale;
-			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
-		}
-
-		if (oldDepthOffset != depthOffset) {
-			oldDepthOffset = depthOffset;
-			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
-		}
-
-		if (oldDepthmapEnable != depthMapEnable) {
-			oldDepthmapEnable = depthMapEnable;
-			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
-		}
-
-		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
-		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
-		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
-		setupUbershaderTexEnv();
-	}
-
 	bindTexturesToSlots();
-
 	if (gpu.fogLUTDirty) {
 		updateFogLUT();
 	}
@@ -951,6 +904,62 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }
 
+void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
+	std::string vertShaderSource = PICA::ShaderGen::decompileShader(
+		shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+	);
+	
+	OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex);
+	//triangleProgram.create({vert, frag});
+	std::cout << vertShaderSource << "\n";
+
+	bool usingUbershader = emulatorConfig->useUbershaders;
+	if (usingUbershader) {
+		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
+		const uint lightCount = (regs[InternalRegs::LightNumber] & 0x7) + 1;
+
+		// Emulating lights in the ubershader is incredibly slow, so we've got an option to render draws using more than N lights via shadergen
+		// This way we generate fewer shaders overall than with full shadergen, but don't tank performance
+		if (emulatorConfig->forceShadergenForLights && lightsEnabled && lightCount >= emulatorConfig->lightShadergenThreshold) {
+			usingUbershader = false;
+		}
+	}
+
+	if (usingUbershader) {
+		gl.useProgram(triangleProgram);
+	} else {
+		OpenGL::Program& program = getSpecializedShader();
+		gl.useProgram(program);
+	}
+
+	// Update ubershader uniforms
+	if (usingUbershader) {
+		const float depthScale = f24::fromRaw(regs[PICA::InternalRegs::DepthScale] & 0xffffff).toFloat32();
+		const float depthOffset = f24::fromRaw(regs[PICA::InternalRegs::DepthOffset] & 0xffffff).toFloat32();
+		const bool depthMapEnable = regs[PICA::InternalRegs::DepthmapEnable] & 1;
+
+		if (oldDepthScale != depthScale) {
+			oldDepthScale = depthScale;
+			glUniform1f(ubershaderData.depthScaleLoc, depthScale);
+		}
+
+		if (oldDepthOffset != depthOffset) {
+			oldDepthOffset = depthOffset;
+			glUniform1f(ubershaderData.depthOffsetLoc, depthOffset);
+		}
+
+		if (oldDepthmapEnable != depthMapEnable) {
+			oldDepthmapEnable = depthMapEnable;
+			glUniform1i(ubershaderData.depthmapEnableLoc, depthMapEnable);
+		}
+
+		// Upload PICA Registers as a single uniform. The shader needs access to the rasterizer registers (for depth, starting from index 0x48)
+		// The texturing and the fragment lighting registers. Therefore we upload them all in one go to avoid multiple slow uniform updates
+		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
+		setupUbershaderTexEnv();
+	}
+}
+
 void RendererGL::screenshot(const std::string& name) {
 	constexpr uint width = 400;
 	constexpr uint height = 2 * 240;

From a2b8a7b23d19c7c1ddd704e91a8b848b1fd1c847 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 24 Jul 2024 23:48:55 +0300
Subject: [PATCH 02/63] Add fmt submodule and port shader decompiler
 instructions to it

---
 .gitmodules                         |  3 +++
 CMakeLists.txt                      |  3 ++-
 src/core/PICA/shader_decompiler.cpp | 18 ++++++++++--------
 third_party/fmt                     |  1 +
 4 files changed, 16 insertions(+), 9 deletions(-)
 create mode 160000 third_party/fmt

diff --git a/.gitmodules b/.gitmodules
index 656e1f41d..5b6301b7e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -76,3 +76,6 @@
 [submodule "third_party/metal-cpp"]
 	path = third_party/metal-cpp
 	url = https://github.com/Panda3DS-emu/metal-cpp
+[submodule "third_party/fmt"]
+	path = third_party/fmt
+	url = https://github.com/fmtlib/fmt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3fe41dd2..7c2ec9f15 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,7 @@ if (NOT ANDROID)
     target_link_libraries(AlberCore PUBLIC SDL2-static)
 endif()
 
+add_subdirectory(third_party/fmt)
 add_subdirectory(third_party/toml11)
 include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
@@ -419,7 +420,7 @@ set(ALL_SOURCES ${SOURCE_FILES} ${FS_SOURCE_FILES} ${CRYPTO_SOURCE_FILES} ${KERN
 target_sources(AlberCore PRIVATE ${ALL_SOURCES})
 
 target_link_libraries(AlberCore PRIVATE dynarmic cryptopp glad resources_console_fonts teakra)
-target_link_libraries(AlberCore PUBLIC glad capstone)
+target_link_libraries(AlberCore PUBLIC glad capstone fmt::fmt)
 
 if(ENABLE_DISCORD_RPC AND NOT ANDROID)
     target_compile_definitions(AlberCore PUBLIC "PANDA3DS_ENABLE_DISCORD_RPC=1")
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index ce7d9a330..826cfaccf 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -1,5 +1,7 @@
 #include "PICA/shader_decompiler.hpp"
 
+#include <fmt/format.h>
+
 #include "config.hpp"
 
 using namespace PICA;
@@ -254,14 +256,14 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
-			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, src1 + " + " + src2); break;
-			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, src1 + " * " + src2); break;
-			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, "max(" + src1 + ", " + src2 + ")"); break;
-			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, "min(" + src1 + ", " + src2 + ")"); break;
-
-			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ".xyz, " + src2 + ".xyz))"); break;
-			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, "vec4(dot(" + src1 + ", " + src2 + "))"); break;
-			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, "vec4(inversesqrt(" + src1 + ".x))"); break;
+			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
+			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2)); break;
+			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
+			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;
+
+			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break;
+			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break;
+			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
 
 			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
 		}
diff --git a/third_party/fmt b/third_party/fmt
new file mode 160000
index 000000000..f8581bcec
--- /dev/null
+++ b/third_party/fmt
@@ -0,0 +1 @@
+Subproject commit f8581bcecf317e8753887b68187c9ef1ba0524f4

From 251ff5ee495039b5f023cbba9191d4e8323da44c Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 25 Jul 2024 00:19:07 +0300
Subject: [PATCH 03/63] Add shader acceleration setting

---
 include/config.hpp    | 6 ++++--
 src/config.cpp        | 2 ++
 src/libretro_core.cpp | 6 +++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/config.hpp b/include/config.hpp
index 52be1af7e..46d2fec81 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -20,11 +20,13 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif
-
+	static constexpr bool accelerateShadersDefault = false;
+	
 	bool shaderJitEnabled = shaderJitDefault;
-	bool discordRpcEnabled = false;
 	bool useUbershaders = ubershaderDefault;
+	bool accelerateShaders = accelerateShadersDefault;
 	bool accurateShaderMul = false;
+	bool discordRpcEnabled = false;
 
 	// Toggles whether to force shadergen when there's more than N lights active and we're using the ubershader, for better performance
 	bool forceShadergenForLights = true;
diff --git a/src/config.cpp b/src/config.cpp
index dae5a0ab0..b774d0640 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -64,6 +64,7 @@ void EmulatorConfig::load() {
 			vsyncEnabled = toml::find_or<toml::boolean>(gpu, "EnableVSync", true);
 			useUbershaders = toml::find_or<toml::boolean>(gpu, "UseUbershaders", ubershaderDefault);
 			accurateShaderMul = toml::find_or<toml::boolean>(gpu, "AccurateShaderMultiplication", false);
+			accelerateShaders = toml::find_or<toml::boolean>(gpu, "AccelerateShaders", accelerateShadersDefault);
 
 			forceShadergenForLights = toml::find_or<toml::boolean>(gpu, "ForceShadergenForLighting", true);
 			lightShadergenThreshold = toml::find_or<toml::integer>(gpu, "ShadergenLightThreshold", 1);
@@ -135,6 +136,7 @@ void EmulatorConfig::save() {
 	data["GPU"]["UseUbershaders"] = useUbershaders;
 	data["GPU"]["ForceShadergenForLighting"] = forceShadergenForLights;
 	data["GPU"]["ShadergenLightThreshold"] = lightShadergenThreshold;
+	data["GPU"]["AccelerateShaders"] = accelerateShaders;
 
 	data["Audio"]["DSPEmulation"] = std::string(Audio::DSPCore::typeToString(dspType));
 	data["Audio"]["EnableAudio"] = audioEnabled;
diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp
index 3e0436b86..fa9f6d14e 100644
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@@ -148,6 +148,8 @@ static bool FetchVariableBool(std::string key, bool def) {
 static void configInit() {
 	static const retro_variable values[] = {
 		{"panda3ds_use_shader_jit", "Enable shader JIT; enabled|disabled"},
+		{"panda3ds_accelerate_shaders",
+		 EmulatorConfig::accelerateShadersDefault ? "Run 3DS shaders on the GPU; enabled|disabled" : "Run 3DS shaders on the GPU; disabled|enabled"},
 		{"panda3ds_accurate_shader_mul", "Enable accurate shader multiplication; disabled|enabled"},
 		{"panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault ? "Use ubershaders (No stutter, maybe slower); enabled|disabled"
 																	  : "Use ubershaders (No stutter, maybe slower); disabled|enabled"},
@@ -179,7 +181,9 @@ static void configUpdate() {
 	config.sdCardInserted = FetchVariableBool("panda3ds_use_virtual_sd", true);
 	config.sdWriteProtected = FetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = FetchVariableBool("panda3ds_accurate_shader_mul", false);
-	config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", true);
+	config.useUbershaders = FetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
+	config.accelerateShaders = FetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
+
 	config.forceShadergenForLights = FetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = std::clamp(std::stoi(FetchVariable("panda3ds_ubershader_lighting_override_threshold", "1")), 1, 8);
 	config.discordRpcEnabled = false;

From 2f4c169cad4ab489d0141921c983e35b80eb8d2f Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 25 Jul 2024 04:04:41 +0300
Subject: [PATCH 04/63] Hook up vertex shaders to shader cache

---
 CMakeLists.txt                       |  1 +
 include/PICA/gpu.hpp                 |  8 ++-
 include/PICA/pica_vert_config.hpp    | 31 ++++++++++
 include/PICA/shader.hpp              | 10 ++--
 include/PICA/shader_gen.hpp          |  2 +
 include/renderer.hpp                 |  3 +-
 include/renderer_gl/renderer_gl.hpp  | 44 +++++++++++++-
 src/core/PICA/gpu.cpp                | 88 ++++++++++++++++-----------
 src/core/PICA/shader_gen_glsl.cpp    | 59 ++++++++++++++++--
 src/core/renderer_gl/renderer_gl.cpp | 89 +++++++++++++++++++---------
 10 files changed, 257 insertions(+), 78 deletions(-)
 create mode 100644 include/PICA/pica_vert_config.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c2ec9f15..a43b7f634 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -256,6 +256,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
+                 include/PICA/pica_vert_config.hpp
 )
 
 cmrc_add_resource_library(
diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp
index ac2a49e6f..1e1d3c4bd 100644
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@@ -13,6 +13,12 @@
 #include "memory.hpp"
 #include "renderer.hpp"
 
+enum class ShaderExecMode {
+	Interpreter,  // Interpret shaders on the CPU
+	JIT,          // Recompile shaders to CPU machine code
+	Hardware,     // Recompile shaders to host shaders and run them on the GPU
+};
+
 class GPU {
 	static constexpr u32 regNum = 0x300;
 	static constexpr u32 extRegNum = 0x1000;
@@ -45,7 +51,7 @@ class GPU {
 	uint immediateModeVertIndex;
 	uint immediateModeAttrIndex;  // Index of the immediate mode attribute we're uploading
 
-	template <bool indexed, bool useShaderJIT>
+	template <bool indexed, ShaderExecMode mode>
 	void drawArrays();
 
 	// Silly method of avoiding linking problems. TODO: Change to something less silly
diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
new file mode 100644
index 000000000..ae774405d
--- /dev/null
+++ b/include/PICA/pica_vert_config.hpp
@@ -0,0 +1,31 @@
+#pragma once
+#include <array>
+#include <cstring>
+#include <type_traits>
+#include <unordered_map>
+
+#include "PICA/pica_hash.hpp"
+#include "PICA/regs.hpp"
+#include "bitfield.hpp"
+#include "helpers.hpp"
+
+namespace PICA {
+	// Configuration struct used as the key for looking up compiled vertex shaders in the renderer's shader cache
+	struct VertConfig {
+		PICAHash::HashType shaderHash;
+		PICAHash::HashType opdescHash;
+		u32 entrypoint;
+		bool usingUbershader;
+
+		bool operator==(const VertConfig& config) const {
+			// Hash function and equality operator required by std::unordered_map
+			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
+		}
+	};
+}  // namespace PICA
+
+// Override std::hash for our vertex config class
+template <>
+struct std::hash<PICA::VertConfig> {
+	std::size_t operator()(const PICA::VertConfig& config) const noexcept { return PICAHash::computeHash((const char*)&config, sizeof(config)); }
+};
\ No newline at end of file
diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index 68b16de88..c725c180a 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -107,6 +107,11 @@ class PICAShader {
 	alignas(16) std::array<vec4f, 16> inputs;           // Attributes passed to the shader
 	alignas(16) std::array<vec4f, 16> outputs;
 	alignas(16) vec4f dummy = vec4f({f24::zero(), f24::zero(), f24::zero(), f24::zero()});  // Dummy register used by the JIT
+	
+	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
+	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
+	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
+	using Hash = PICAHash::HashType;
 
   protected:
 	std::array<u32, 128> operandDescriptors;
@@ -125,11 +130,6 @@ class PICAShader {
 	std::array<CallInfo, 4> callInfo;
 	ShaderType type;
 
-	// We use a hashmap for matching 3DS shaders to their equivalent compiled code in our shader cache in the shader JIT
-	// We choose our hash type to be a 64-bit integer by default, as the collision chance is very tiny and generating it is decently optimal
-	// Ideally we want to be able to support multiple different types of hash depending on compilation settings, but let's get this working first
-	using Hash = PICAHash::HashType;
-
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)
 
diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp
index 215e5adb0..2d39e0787 100644
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@@ -30,6 +30,8 @@ namespace PICA::ShaderGen {
 		FragmentGenerator(API api, Language language) : api(api), language(language) {}
 		std::string generate(const PICA::FragmentConfig& config);
 		std::string getDefaultVertexShader();
+		// For when PICA shader acceleration is enabled. Turn the PICA shader source into a proper vertex shader
+		std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
 
 		void setTarget(API api, Language language) {
 			this->api = api;
diff --git a/include/renderer.hpp b/include/renderer.hpp
index 1d1fb6824..721364c1a 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -82,7 +82,8 @@ class Renderer {
 	// This function is called on every draw call before parsing vertex data.
 	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
 	// ubershaders and shadergen, and so on.
-	virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {}
+	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; }
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 6c18a0c63..0597235bb 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -3,11 +3,14 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <optional>
 #include <span>
 #include <unordered_map>
+#include <utility>
 
 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_hash.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
@@ -52,6 +55,11 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
+	// Set by prepareForDraw, tells us whether the current draw is using a hw-accelerated shader
+	bool usingAcceleratedShader = false;
+
+	// Cached pointer to the current vertex shader when using HW accelerated shaders
+	OpenGL::Shader* generatedVertexShader = nullptr;
 
 	SurfaceCache<DepthBuffer, 16, true> depthBufferCache;
 	SurfaceCache<ColourBuffer, 16, true> colourBufferCache;
@@ -74,7 +82,38 @@ class RendererGL final : public Renderer {
 		OpenGL::Program program;
 		uint uboBinding;
 	};
-	std::unordered_map<PICA::FragmentConfig, CachedProgram> shaderCache;
+
+	struct ShaderCache {
+		std::unordered_map<PICA::VertConfig, std::optional<OpenGL::Shader>> vertexShaderCache;
+		std::unordered_map<PICA::FragmentConfig, OpenGL::Shader> fragmentShaderCache;
+
+		// Program cache indexed by GLuints for the vertex and fragment shader to use
+		// Top 32 bits are the vertex shader GLuint, bottom 32 bits are the fs GLuint
+		std::unordered_map<u64, CachedProgram> programCache;
+
+		void clear() {
+			for (auto& it : programCache) {
+				CachedProgram& cachedProgram = it.second;
+				cachedProgram.program.free();
+				glDeleteBuffers(1, &cachedProgram.uboBinding);
+			}
+
+			for (auto& it : vertexShaderCache) {
+				if (it.second.has_value()) {
+					it.second->free();
+				}
+			}
+
+			for (auto& it : fragmentShaderCache) {
+				it.second.free();
+			}
+
+			programCache.clear();
+			vertexShaderCache.clear();
+			fragmentShaderCache.clear();
+		}
+	};
+	ShaderCache shaderCache;
 
 	OpenGL::Framebuffer getColourFBO();
 	OpenGL::Texture getTexture(Texture& tex);
@@ -109,14 +148,13 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-	virtual void prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
 	// Note: The caller is responsible for deleting the currently bound FBO before calling this
 	void setFBO(uint handle) { screenFramebuffer.m_handle = handle; }
 	void resetStateManager() { gl.reset(); }
-	void clearShaderCache();
 	void initUbershader(OpenGL::Program& program);
 
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index b6d903e4a..077c65aa5 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -123,27 +123,38 @@ void GPU::reset() {
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
-	renderer->prepareForDraw(shaderUnit, false);
-	const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
-
-	if (indexed) {
-		if (shaderJITEnabled)
-			drawArrays<true, true>();
-		else
-			drawArrays<true, false>();
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, false);
+
+	if (hwShaders) {
+		if (indexed) {
+			drawArrays<true, ShaderExecMode::Hardware>();
+		} else {
+			drawArrays<false, ShaderExecMode::Hardware>();
+		}
 	} else {
-		if (shaderJITEnabled)
-			drawArrays<false, true>();
-		else
-			drawArrays<false, false>();
+		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
+
+		if (indexed) {
+			if (shaderJITEnabled) {
+				drawArrays<true, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<true, ShaderExecMode::Interpreter>();
+			}
+		} else {
+			if (shaderJITEnabled) {
+				drawArrays<false, ShaderExecMode::JIT>();
+			} else {
+				drawArrays<false, ShaderExecMode::Interpreter>();
+			}
+		}
 	}
 }
 
 static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
 
-template <bool indexed, bool useShaderJIT>
+template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
-	if constexpr (useShaderJIT) {
+	if constexpr (mode == ShaderExecMode::JIT) {
 		shaderJIT.prepare(shaderUnit.vs);
 	}
 
@@ -322,29 +333,38 @@ void GPU::drawArrays() {
 			}
 		}
 
-		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
-		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-		// Ie it might attribute #0 to v2, #1 to v7, etc
-		for (int j = 0; j < totalAttribCount; j++) {
-			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
-		}
+		// Running shader on the CPU instead of the GPU
+		if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
+			// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
+			// Based on the SH_ATTRIBUTES_PERMUTATION registers.
+			// Ie it might map attribute #0 to v2, #1 to v7, etc
+			for (int j = 0; j < totalAttribCount; j++) {
+				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
+				std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
+			}
 
-		if constexpr (useShaderJIT) {
-			shaderJIT.run(shaderUnit.vs);
-		} else {
-			shaderUnit.vs.run();
-		}
+			if constexpr (mode == ShaderExecMode::JIT) {
+				shaderJIT.run(shaderUnit.vs);
+			} else {
+				shaderUnit.vs.run();
+			}
 
-		PICA::Vertex& out = vertices[i];
-		// Map shader outputs to fixed function properties
-		const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
-		for (int i = 0; i < totalShaderOutputs; i++) {
-			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+			PICA::Vertex& out = vertices[i];
+			// Map shader outputs to fixed function properties
+			const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			for (int i = 0; i < totalShaderOutputs; i++) {
+				const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
 
-			for (int j = 0; j < 4; j++) {  // pls unroll
-				const u32 mapping = (config >> (j * 8)) & 0x1F;
-				out.raw[mapping] = vsOutputRegisters[i][j];
+				for (int j = 0; j < 4; j++) {  // pls unroll
+					const u32 mapping = (config >> (j * 8)) & 0x1F;
+					out.raw[mapping] = vsOutputRegisters[i][j];
+				}
+			}
+		} else {  // Using hw shaders and running the shader on the GPU, just write the inputs to the attribute buffer directly
+			PICA::Vertex& out = vertices[i];
+			for (int j = 0; j < totalAttribCount; j++) {
+				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
+				std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f));
 			}
 		}
 	}
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index 9802be902..d4deee356 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -72,11 +72,6 @@ std::string FragmentGenerator::getDefaultVertexShader() {
 		out float gl_ClipDistance[2];
 	#endif
 
-		vec4 abgr8888ToVec4(uint abgr) {
-			const float scale = 1.0 / 255.0;
-			return scale * vec4(float(abgr & 0xffu), float((abgr >> 8) & 0xffu), float((abgr >> 16) & 0xffu), float(abgr >> 24));
-		}
-
 		void main() {
 			gl_Position = a_coords;
 			vec4 colourAbs = abs(a_vertexColour);
@@ -677,4 +672,58 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "vec2 value = texelFetch(u_tex_luts, ivec2(int(clamped_index), 24), 0).rg;"; // fog LUT is past the light LUTs
 	shader += "float fog_factor = clamp(value.r + value.g * delta, 0.0, 1.0);";
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
+}
+
+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
+	if (usingUbershader) {
+		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
+		return picaSource;
+	} else {
+		// TODO: Uniforms and don't hardcode fixed-function semantic indices...
+		std::string ret = picaSource;
+		if (api == API::GLES) {
+			ret += "\n#define USING_GLES\n";
+		}
+
+		ret += R"(
+out vec4 v_quaternion;
+out vec4 v_colour;
+out vec3 v_texcoord0;
+out vec2 v_texcoord1;
+out vec3 v_view;
+out vec2 v_texcoord2;
+
+#ifndef USING_GLES
+	out float gl_ClipDistance[2];
+#endif
+
+void main() {
+	pica_shader_main();
+	vec4 a_coords = output_registers[0];
+	vec4 a_vertexColour = output_registers[1];
+	vec2 a_texcoord0 = output_registers[2].xy;
+	float a_texcoord0_w = output_registers[2].w;
+	vec2 a_texcoord1 = output_registers[3].xy;
+	vec2 a_texcoord2 = output_registers[4].xy;
+	vec3 a_view = output_registers[5].xyz;
+	vec4 a_quaternion = output_registers[6];
+
+	gl_Position = a_coords;
+	vec4 colourAbs = abs(a_vertexColour);
+	v_colour = min(colourAbs, vec4(1.f));
+
+	v_texcoord0 = vec3(a_texcoord0.x, 1.0 - a_texcoord0.y, a_texcoord0_w);
+	v_texcoord1 = vec2(a_texcoord1.x, 1.0 - a_texcoord1.y);
+	v_texcoord2 = vec2(a_texcoord2.x, 1.0 - a_texcoord2.y);
+	v_view = a_view;
+	v_quaternion = a_quaternion;
+
+#ifndef USING_GLES
+	//gl_ClipDistance[0] = -a_coords.z;
+	//gl_ClipDistance[1] = dot(clipCoords, a_coords);
+#endif
+})";
+
+		return ret;
+	}
 }
\ No newline at end of file
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 90eccf47a..c593ad96f 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -25,7 +25,7 @@ void RendererGL::reset() {
 	colourBufferCache.reset();
 	textureCache.reset();
 
-	clearShaderCache();
+	shaderCache.clear();
 
 	// Init the colour/depth buffer settings to some random defaults on reset
 	colourBufferLoc = 0;
@@ -788,18 +788,24 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 
 	PICA::FragmentConfig fsConfig(regs);
 
-	CachedProgram& programEntry = shaderCache[fsConfig];
+	OpenGL::Shader& fragShader = shaderCache.fragmentShaderCache[fsConfig];
+	if (!fragShader.exists()) {
+		std::string fs = fragShaderGen.generate(fsConfig);
+		fragShader.create({fs.c_str(), fs.size()}, OpenGL::Fragment);
+	}
+
+	// Get the handle of the current vertex shader
+	OpenGL::Shader& vertexShader = usingAcceleratedShader ? *generatedVertexShader : defaultShadergenVs;
+	// And form the key for looking up a shader program
+	const u64 programKey = (u64(vertexShader.handle()) << 32) | u64(fragShader.handle());
+
+	CachedProgram& programEntry = shaderCache.programCache[programKey];
 	OpenGL::Program& program = programEntry.program;
 
 	if (!program.exists()) {
-		std::string fs = fragShaderGen.generate(fsConfig);
-
-		OpenGL::Shader fragShader({fs.c_str(), fs.size()}, OpenGL::Fragment);
-		program.create({defaultShadergenVs, fragShader});
+		program.create({vertexShader, fragShader});
 		gl.useProgram(program);
 
-		fragShader.free();
-
 		// Init sampler objects. Texture 0 goes in texture unit 0, texture 1 in TU 1, texture 2 in TU 2, and the light maps go in TU 3
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex0"), 0);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
@@ -904,15 +910,8 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }
 
-void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
-	std::string vertShaderSource = PICA::ShaderGen::decompileShader(
-		shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
-	);
-	
-	OpenGL::Shader vert({vertShaderSource.c_str(), vertShaderSource.size()}, OpenGL::Vertex);
-	//triangleProgram.create({vert, frag});
-	std::cout << vertShaderSource << "\n";
-
+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
+	// First we figure out if we will be using an ubershader
 	bool usingUbershader = emulatorConfig->useUbershaders;
 	if (usingUbershader) {
 		const bool lightsEnabled = (regs[InternalRegs::LightingEnable] & 1) != 0;
@@ -925,6 +924,46 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 		}
 	}
 
+	// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
+	// TODO: Ubershader support for accelerated shaders
+	usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader;
+
+	if (usingAcceleratedShader) {
+		auto shaderCodeHash = shaderUnit.vs.getCodeHash();
+		auto opdescHash = shaderUnit.vs.getOpdescHash();
+		auto vertexConfig = PICA::VertConfig{
+			.shaderHash = shaderCodeHash,
+			.opdescHash = opdescHash,
+			.entrypoint = shaderUnit.vs.entrypoint,
+			.usingUbershader = usingUbershader,
+		};
+
+		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
+		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
+		if (!shader.has_value()) {
+			// Initialize shader to a "null" shader (handle == 0)
+			*shader = OpenGL::Shader();
+
+			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
+				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+			);
+			
+			// Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload
+			// it to the GPU
+			if (!picaShaderSource.empty()) {
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
+				shader->create({vertexShaderSource}, OpenGL::Vertex);
+			}
+		}
+
+		// Shader generation did not work out, so set usingAcceleratedShader to false
+		if (!shader->exists()) {
+			usingAcceleratedShader = false;
+		} else {
+			generatedVertexShader = &(*shader);
+		}
+	}
+
 	if (usingUbershader) {
 		gl.useProgram(triangleProgram);
 	} else {
@@ -958,6 +997,8 @@ void RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 		glUniform1uiv(ubershaderData.picaRegLoc, 0x200 - 0x48, &regs[0x48]);
 		setupUbershaderTexEnv();
 	}
+
+	return usingAcceleratedShader;
 }
 
 void RendererGL::screenshot(const std::string& name) {
@@ -985,22 +1026,12 @@ void RendererGL::screenshot(const std::string& name) {
 	stbi_write_png(name.c_str(), width, height, 4, flippedPixels.data(), 0);
 }
 
-void RendererGL::clearShaderCache() {
-	for (auto& shader : shaderCache) {
-		CachedProgram& cachedProgram = shader.second;
-		cachedProgram.program.free();
-		glDeleteBuffers(1, &cachedProgram.uboBinding);
-	}
-
-	shaderCache.clear();
-}
-
 void RendererGL::deinitGraphicsContext() {
 	// Invalidate all surface caches since they'll no longer be valid
 	textureCache.reset();
 	depthBufferCache.reset();
 	colourBufferCache.reset();
-	clearShaderCache();
+	shaderCache.clear();
 
 	// All other GL objects should be invalidated automatically and be recreated by the next call to initGraphicsContext
 	// TODO: Make it so that depth and colour buffers get written back to 3DS memory
@@ -1048,4 +1079,4 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
-}
+}
\ No newline at end of file

From efcb42af2c15fbe5e837dcdc159384ca87034551 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 25 Jul 2024 23:36:22 +0300
Subject: [PATCH 05/63] Shader decompiler: Fix redundant compilations

---
 include/renderer_gl/renderer_gl.hpp  | 1 -
 src/core/renderer_gl/renderer_gl.cpp | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 41dba6eb6..2471bae03 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -95,7 +95,6 @@ class RendererGL final : public Renderer {
 			for (auto& it : programCache) {
 				CachedProgram& cachedProgram = it.second;
 				cachedProgram.program.free();
-				glDeleteBuffers(1, &cachedProgram.uboBinding);
 			}
 
 			for (auto& it : vertexShaderCache) {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 6f0cab1a9..5cd7ccedb 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -944,7 +944,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
 		if (!shader.has_value()) {
 			// Initialize shader to a "null" shader (handle == 0)
-			*shader = OpenGL::Shader();
+			shader = OpenGL::Shader();
 
 			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
 				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL

From d9f4f3736f48fdb20da6c33915daa68eb6adaf23 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 00:21:26 +0300
Subject: [PATCH 06/63] Shader Decompiler: Fix vertex attribute upload

---
 include/renderer_gl/renderer_gl.hpp  |  5 ++-
 src/core/PICA/gpu.cpp                |  3 +-
 src/core/renderer_gl/renderer_gl.cpp | 47 +++++++++++++++++-----------
 3 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 2471bae03..cb9328276 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -31,7 +31,10 @@ class RendererGL final : public Renderer {
 	OpenGL::Program triangleProgram;
 	OpenGL::Program displayProgram;
 
-	OpenGL::VertexArray vao;
+	// VAO for when not using accelerated vertex shaders. Contains attribute declarations matching to the PICA fixed function fragment attributes
+	OpenGL::VertexArray defaultVAO;
+	// VAO for when using accelerated vertex shaders. The PICA vertex shader inputs are passed as attributes without CPU processing.
+	OpenGL::VertexArray hwShaderVAO;
 	OpenGL::VertexBuffer vbo;
 
 	// Data 
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 077c65aa5..a6d734fd0 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -364,7 +364,8 @@ void GPU::drawArrays() {
 			PICA::Vertex& out = vertices[i];
 			for (int j = 0; j < totalAttribCount; j++) {
 				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				std::memcpy(&out.raw[mapping], &currentAttributes[j], sizeof(vec4f));
+				// Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats
+				std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f));
 			}
 		}
 	}
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 5cd7ccedb..c2c041b3d 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -85,33 +85,42 @@ void RendererGL::initGraphicsContextInternal() {
 
 	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
-	vao.create();
-	gl.bindVAO(vao);
+	// Initialize the VAO used when not using hw shaders
+	defaultVAO.create();
+	gl.bindVAO(defaultVAO);
 
 	// Position (x, y, z, w) attributes
-	vao.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
-	vao.enableAttribute(0);
+	defaultVAO.setAttributeFloat<float>(0, 4, sizeof(Vertex), offsetof(Vertex, s.positions));
+	defaultVAO.enableAttribute(0);
 	// Quaternion attribute
-	vao.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
-	vao.enableAttribute(1);
+	defaultVAO.setAttributeFloat<float>(1, 4, sizeof(Vertex), offsetof(Vertex, s.quaternion));
+	defaultVAO.enableAttribute(1);
 	// Colour attribute
-	vao.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
-	vao.enableAttribute(2);
+	defaultVAO.setAttributeFloat<float>(2, 4, sizeof(Vertex), offsetof(Vertex, s.colour));
+	defaultVAO.enableAttribute(2);
 	// UV 0 attribute
-	vao.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
-	vao.enableAttribute(3);
+	defaultVAO.setAttributeFloat<float>(3, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord0));
+	defaultVAO.enableAttribute(3);
 	// UV 1 attribute
-	vao.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
-	vao.enableAttribute(4);
+	defaultVAO.setAttributeFloat<float>(4, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord1));
+	defaultVAO.enableAttribute(4);
 	// UV 0 W-component attribute
-	vao.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
-	vao.enableAttribute(5);
+	defaultVAO.setAttributeFloat<float>(5, 1, sizeof(Vertex), offsetof(Vertex, s.texcoord0_w));
+	defaultVAO.enableAttribute(5);
 	// View
-	vao.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
-	vao.enableAttribute(6);
+	defaultVAO.setAttributeFloat<float>(6, 3, sizeof(Vertex), offsetof(Vertex, s.view));
+	defaultVAO.enableAttribute(6);
 	// UV 2 attribute
-	vao.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
-	vao.enableAttribute(7);
+	defaultVAO.setAttributeFloat<float>(7, 2, sizeof(Vertex), offsetof(Vertex, s.texcoord2));
+	defaultVAO.enableAttribute(7);
+
+	// Initialize the VAO used for hw shaders
+	hwShaderVAO.create();
+	gl.bindVAO(hwShaderVAO);
+	for (int attr = 0; attr < 8; attr++) {
+		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4);
+		hwShaderVAO.enableAttribute(attr);
+	}
 
 	dummyVBO.create();
 	dummyVAO.create();
@@ -418,7 +427,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
 	gl.bindVBO(vbo);
-	gl.bindVAO(vao);
+	gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
 
 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {

From 2fc09223aa0f131ddff5448d6796d08381a1478e Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 01:08:00 +0300
Subject: [PATCH 07/63] Shader compiler: Simplify generated code for reading
 and faster compilation

---
 src/core/PICA/shader_decompiler.cpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 826cfaccf..d2414c13a 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -163,6 +163,12 @@ std::string ShaderDecompiler::getDest(u32 dest) const {
 }
 
 std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
+	// If the swizzle field is this value then the swizzle pattern is .xyzw so we don't need a shuffle
+	static constexpr uint noSwizzle = 0x1B;
+	if (swizzle == noSwizzle) {
+		return "";
+	}
+
 	static constexpr std::array<char, 4> names = {'x', 'y', 'z', 'w'};
 	std::string ret(".    ");
 	
@@ -211,8 +217,10 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 	decompiledShader += dest + destSwizzle + " = ";
 	if (writtenLaneCount == 1) {
 		decompiledShader += "float(" + value + ");\n";
-	} else {
-		decompiledShader += "vec" + std::to_string(writtenLaneCount) + "(" + value + ");\n";
+	} else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4
+		decompiledShader += fmt::format("vec{}({});\n", writtenLaneCount, value);
+	} else if (writtenLaneCount == 4) {
+		decompiledShader += fmt::format("{};\n", value);
 	}
 }
 

From 213183895abe05e4720520dbce6f06ba7cee1403 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 01:15:03 +0300
Subject: [PATCH 08/63] Further simplify shader decompiler output

---
 src/core/PICA/shader_decompiler.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index d2414c13a..5559bcc56 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -182,7 +182,6 @@ std::string ShaderDecompiler::getSwizzlePattern(u32 swizzle) const {
 
 std::string ShaderDecompiler::getDestSwizzle(u32 destinationMask) const {
 	std::string ret = ".";
-	
 	if (destinationMask & 0b1000) {
 		ret += "x";
 	}
@@ -214,7 +213,8 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 		return;
 	}
 
-	decompiledShader += dest + destSwizzle + " = ";
+	// Don't write destination swizzle if all lanes are getting written to
+	decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
 	if (writtenLaneCount == 1) {
 		decompiledShader += "float(" + value + ");\n";
 	} else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4

From e8b4992036eb254ed48b3775a072bf4da16e22fb Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 01:24:52 +0300
Subject: [PATCH 09/63] Shader decompiler: More smallen-ing

---
 src/core/PICA/shader_decompiler.cpp | 10 +++++-----
 src/core/PICA/shader_gen_glsl.cpp   | 16 ++++++++--------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 5559bcc56..599bd31e1 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -81,8 +81,8 @@ void ShaderDecompiler::writeAttributes() {
 		uint uniform_bool;
 	};
 
-	vec4 temp_registers[16];
-	vec4 output_registers[8];
+	vec4 tmp_regs[16];
+	vec4 out_regs[8];
 	vec4 dummy_vec = vec4(0.0);
 )";
 }
@@ -141,7 +141,7 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index)
 	if (source < 0x10) {
 		return "inputs[" + std::to_string(source) + "]";
 	} else if (source < 0x20) {
-		return "temp_registers[" + std::to_string(source - 0x10) + "]";
+		return "tmp_regs[" + std::to_string(source - 0x10) + "]";
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;
 
@@ -154,9 +154,9 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index)
 
 std::string ShaderDecompiler::getDest(u32 dest) const {
 	if (dest < 0x10) {
-		return "output_registers[" + std::to_string(dest) + "]";
+		return "out_regs[" + std::to_string(dest) + "]";
 	} else if (dest < 0x20) {
-		return "temp_registers[" + std::to_string(dest - 0x10) + "]";
+		return "tmp_regs[" + std::to_string(dest - 0x10) + "]";
 	} else {
 		return "dummy_vec";
 	}
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index cb78242dc..edc8a293c 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -696,14 +696,14 @@ out vec2 v_texcoord2;
 
 void main() {
 	pica_shader_main();
-	vec4 a_coords = output_registers[0];
-	vec4 a_vertexColour = output_registers[1];
-	vec2 a_texcoord0 = output_registers[2].xy;
-	float a_texcoord0_w = output_registers[2].w;
-	vec2 a_texcoord1 = output_registers[3].xy;
-	vec2 a_texcoord2 = output_registers[4].xy;
-	vec3 a_view = output_registers[5].xyz;
-	vec4 a_quaternion = output_registers[6];
+	vec4 a_coords = out_regs[0];
+	vec4 a_vertexColour = out_regs[1];
+	vec2 a_texcoord0 = out_regs[2].xy;
+	float a_texcoord0_w = out_regs[2].w;
+	vec2 a_texcoord1 = out_regs[3].xy;
+	vec2 a_texcoord2 = out_regs[4].xy;
+	vec3 a_view = out_regs[5].xyz;
+	vec4 a_quaternion = out_regs[6];
 
 	gl_Position = a_coords;
 	vec4 colourAbs = abs(a_vertexColour);

From 67ff1ccb8b50e0ad51c4f870cc656e0dbdebbf9d Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 14:28:48 +0300
Subject: [PATCH 10/63] Shader decompiler: Get PICA uniforms uploaded to the
 GPU

---
 include/PICA/shader.hpp              |  4 ++++
 include/renderer_gl/renderer_gl.hpp  |  2 ++
 src/core/renderer_gl/renderer_gl.cpp | 28 ++++++++++++++++++++++------
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index c725c180a..7f127795e 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -301,6 +301,10 @@ class PICAShader {
 
 	Hash getCodeHash();
 	Hash getOpdescHash();
+
+	// Returns how big the PICA uniforms are combined. Used for hw accelerated shaders where we upload the uniforms to our GPU.
+	static constexpr usize totalUniformSize() { return sizeof(floatUniforms) + sizeof(intUniforms) + sizeof(boolUniform); }
+	void* getUniformPointer() { return static_cast<void*>(&floatUniforms); }
 };
 
 static_assert(
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index cb9328276..73b52cc5a 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -80,6 +80,8 @@ class RendererGL final : public Renderer {
 	// We can compile this once and then link it with all other generated fragment shaders
 	OpenGL::Shader defaultShadergenVs;
 	GLuint shadergenFragmentUBO;
+	// UBO for uploading the PICA uniforms when using hw shaders
+	GLuint hwShaderUniformUBO;
 
 	// Cached recompiled fragment shader
 	struct CachedProgram {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index c2c041b3d..17e3702f3 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -83,6 +83,11 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.bindUBO(shadergenFragmentUBO);
 	glBufferData(GL_UNIFORM_BUFFER, sizeof(PICA::FragmentUniforms), nullptr, GL_DYNAMIC_DRAW);
 
+	// Allocate memory for the accelerated vertex shader uniform UBO
+	glGenBuffers(1, &hwShaderUniformUBO);
+	gl.bindUBO(hwShaderUniformUBO);
+	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
+
 	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
 	// Initialize the VAO used when not using hw shaders
@@ -798,7 +803,8 @@ std::optional<ColourBuffer> RendererGL::getColourBuffer(u32 addr, PICA::ColorFmt
 }
 
 OpenGL::Program& RendererGL::getSpecializedShader() {
-	constexpr uint uboBlockBinding = 2;
+	constexpr uint vsUBOBlockBinding = 1;
+	constexpr uint fsUBOBlockBinding = 2;
 
 	PICA::FragmentConfig fsConfig(regs);
 
@@ -826,12 +832,20 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 		glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
 
-		// Set up the binding for our UBO. Sadly we can't specify it in the shader like normal people,
+		// Set up the binding for our UBOs. Sadly we can't specify it in the shader like normal people,
 		// As it's an OpenGL 4.2 feature that MacOS doesn't support...
-		uint uboIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
-		glUniformBlockBinding(program.handle(), uboIndex, uboBlockBinding);
+		uint fsUBOIndex = glGetUniformBlockIndex(program.handle(), "FragmentUniforms");
+		glUniformBlockBinding(program.handle(), fsUBOIndex, fsUBOBlockBinding);
+
+		if (usingAcceleratedShader) {
+			uint vertexUBOIndex = glGetUniformBlockIndex(program.handle(), "PICAShaderUniforms");
+			glUniformBlockBinding(program.handle(), vertexUBOIndex, vsUBOBlockBinding);
+		}
+	}
+	glBindBufferBase(GL_UNIFORM_BUFFER, fsUBOBlockBinding, shadergenFragmentUBO);
+	if (usingAcceleratedShader) {
+		glBindBufferBase(GL_UNIFORM_BUFFER, vsUBOBlockBinding, hwShaderUniformUBO);
 	}
-	glBindBufferBase(GL_UNIFORM_BUFFER, uboBlockBinding, shadergenFragmentUBO);
 
 	// Upload uniform data to our shader's UBO
 	PICA::FragmentUniforms uniforms;
@@ -958,7 +972,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
 				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
 			);
-			
+
 			// Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload
 			// it to the GPU
 			if (!picaShaderSource.empty()) {
@@ -972,6 +986,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 			usingAcceleratedShader = false;
 		} else {
 			generatedVertexShader = &(*shader);
+			gl.bindUBO(hwShaderUniformUBO);
+			glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
 		}
 	}
 

From db64b0a260d09ebd0e3c1bba1c07b21ba40ee52c Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 14:41:28 +0300
Subject: [PATCH 11/63] Shader decompiler: Readd clipping

---
 src/core/PICA/shader_gen_glsl.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index edc8a293c..2dbccaeb7 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -682,6 +682,8 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic
 			ret += "\n#define USING_GLES\n";
 		}
 
+		ret += uniformDefinition;
+
 		ret += R"(
 out vec4 v_quaternion;
 out vec4 v_colour;
@@ -716,8 +718,8 @@ void main() {
 	v_quaternion = a_quaternion;
 
 #ifndef USING_GLES
-	//gl_ClipDistance[0] = -a_coords.z;
-	//gl_ClipDistance[1] = dot(clipCoords, a_coords);
+	gl_ClipDistance[0] = -a_coords.z;
+	gl_ClipDistance[1] = dot(clipCoords, a_coords);
 #endif
 })";
 

From 67daf03e446371bd9d07ce7e061b0a00605b988a Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:27:41 +0300
Subject: [PATCH 12/63] Shader decompiler: Actually `break` on control flow
 instructions

---
 src/core/PICA/shader_decompiler.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 599bd31e1..3cdbeb8ed 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -34,14 +34,14 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		const u32 opcode = instruction >> 26;
 
 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)");
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)");
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)");
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)");
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)");
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)");
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)");
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)");
+			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)"); break;
+			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)"); break;
+			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)"); break;
+			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)"); break;
+			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break;
+			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
+			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
+			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break;
 			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 
 			default: break;

From 5eb15de431ecbec71475f55b28c31f1ce62fe046 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 22:02:03 +0300
Subject: [PATCH 13/63] Shader decompiler: More control flow handling

---
 include/PICA/shader_decompiler.hpp  |   5 +-
 src/core/PICA/shader_decompiler.cpp | 217 ++++++++++++++++++++++++++--
 2 files changed, 206 insertions(+), 16 deletions(-)

diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp
index 1253226fc..42bd56429 100644
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@@ -4,6 +4,7 @@
 #include <tuple>
 #include <map>
 #include <vector>
+#include <utility>
 
 #include "PICA/shader.hpp"
 #include "PICA/shader_gen_types.hpp"
@@ -95,7 +96,8 @@ namespace PICA::ShaderGen {
 		Language language;
 
 		void compileInstruction(u32& pc, bool& finished);
-		void compileRange(const AddressRange& range);
+		// Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction)
+		std::pair<u32, bool> compileRange(const AddressRange& range);
 		void callFunction(const Function& function);
 		const Function* findFunction(const AddressRange& range);
 
@@ -105,6 +107,7 @@ namespace PICA::ShaderGen {
 		std::string getDest(u32 dest) const;
 		std::string getSwizzlePattern(u32 swizzle) const;
 		std::string getDestSwizzle(u32 destinationMask) const;
+		const char* getCondition(u32 cond, u32 refX, u32 refY);
 
 		void setDest(u32 operandDescriptor, const std::string& dest, const std::string& value);
 		// Returns if the instruction uses the typical register encodings most instructions use
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 3cdbeb8ed..5134845ed 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -2,6 +2,9 @@
 
 #include <fmt/format.h>
 
+#include <array>
+#include <cassert>
+
 #include "config.hpp"
 
 using namespace PICA;
@@ -20,6 +23,40 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	}
 }
 
+// Helpers for merging parallel/series exit methods from Citra
+// Merges exit method of two parallel branches.
+static ExitMode exitParallel(ExitMode a, ExitMode b) {
+	if (a == ExitMode::Unknown) {
+		return b;
+	}
+	else if (b == ExitMode::Unknown) {
+		return a;
+	}
+	else if (a == b) {
+		return a;
+	}
+	return ExitMode::Conditional;
+}
+
+// Cascades exit method of two blocks of code.
+static ExitMode exitSeries(ExitMode a, ExitMode b) {
+	assert(a != ExitMode::AlwaysEnd);
+
+	if (a == ExitMode::Unknown) {
+		return ExitMode::Unknown;
+	}
+
+	if (a == ExitMode::AlwaysReturn) {
+		return b;
+	}
+
+	if (b == ExitMode::Unknown || b == ExitMode::AlwaysEnd) {
+		return ExitMode::AlwaysEnd;
+	}
+
+	return ExitMode::Conditional;
+}
+
 ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 end, Function::Labels& labels) {
 	// Initialize exit mode to unknown by default, in order to detect things like unending loops
 	auto [it, inserted] = exitMap.emplace(AddressRange(start, end), ExitMode::Unknown);
@@ -32,17 +69,63 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 	for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) {
 		const u32 instruction = shader.loadedShader[pc];
 		const u32 opcode = instruction >> 26;
+		auto setExitMode = [&it](ExitMode mode) {
+			it->second = mode;
+			return it->second;
+		};
 
 		switch (opcode) {
-			case ShaderOpcodes::JMPC: Helpers::panic("Unimplemented control flow operation (JMPC)"); break;
-			case ShaderOpcodes::JMPU: Helpers::panic("Unimplemented control flow operation (JMPU)"); break;
-			case ShaderOpcodes::IFU: Helpers::panic("Unimplemented control flow operation (IFU)"); break;
-			case ShaderOpcodes::IFC: Helpers::panic("Unimplemented control flow operation (IFC)"); break;
+			case ShaderOpcodes::JMPC:
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				// Register this jump address to our outLabels set
+				labels.insert(dest);
+
+				// This opens up 2 parallel paths of execution
+				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels);
+				return setExitMode(exitParallel(branchTakenExit, branchNotTakenExit));
+			}
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				Helpers::panic("IFC/IFU");
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+
+				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					return setExitMode(ExitMode::Unknown);
+				}
+
+				// Next analyze the not taken func
+				ExitMode branchNotTakenExitMode = ExitMode::AlwaysReturn;
+				if (num != 0) {
+					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
+					// Check if analysis failed and return unknown if it did
+					if (analysisFailed) {
+						return setExitMode(ExitMode::Unknown);
+					}
+
+					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
+				}
+
+				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
+				// Both branches of the if/else end, so there's nothing after the call
+				if (parallel == ExitMode::AlwaysEnd) {
+					return setExitMode(parallel);
+				} else {
+					ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels);
+					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
+					return setExitMode(conditionalExitMode);
+				}
+				break;
+			}
 			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break;
 			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
 			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
 			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break;
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
+			case ShaderOpcodes::END: return setExitMode(ExitMode::AlwaysEnd);
 
 			default: break;
 		}
@@ -52,7 +135,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 	return ExitMode::AlwaysReturn;
 }
 
-void ShaderDecompiler::compileRange(const AddressRange& range) {
+std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {
 	u32 pc = range.start;
 	const u32 end = range.end >= range.start ? range.end : PICAShader::maxInstructionCount;
 	bool finished = false;
@@ -60,6 +143,8 @@ void ShaderDecompiler::compileRange(const AddressRange& range) {
 	while (pc < end && !finished) {
 		compileInstruction(pc, finished);
 	}
+
+	return std::make_pair(pc, finished);
 }
 
 const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
@@ -84,6 +169,7 @@ void ShaderDecompiler::writeAttributes() {
 	vec4 tmp_regs[16];
 	vec4 out_regs[8];
 	vec4 dummy_vec = vec4(0.0);
+	bvec2 cmp_reg = bvec2(false);
 )";
 }
 
@@ -124,14 +210,45 @@ std::string ShaderDecompiler::decompile() {
 	callFunction(*findFunction(mainFunctionRange));
 	decompiledShader += "}\n";
 
-	for (auto& func : controlFlow.functions) {
-		if (func.outLabels.size() > 0) {
-			Helpers::panic("Function with out labels");
-		}
+	for (const Function& func : controlFlow.functions) {
+		if (func.outLabels.empty()) {
+			decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier());
+			compileRange(AddressRange(func.start, func.end));
+			decompiledShader += "}\n";
+		} else {
+			auto labels = func.outLabels;
+			labels.insert(func.start);
+
+			// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
+			// current PC
+			decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier());
+			decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
+			decompiledShader += "while(true){\nswitch(pc){\n";
+
+			for (u32 label : labels) {
+				decompiledShader += fmt::format("case {}u: {{", label);
+				// Fetch the next label whose address > label
+				auto it = labels.lower_bound(label + 1);
+				u32 next = (it == labels.end()) ? func.end : *it;
+
+				auto [endPC, finished] = compileRange(AddressRange(label, next));
+				if (endPC > next && !finished) {
+					labels.insert(endPC);
+					decompiledShader += fmt::format("pc = {}u; break;", endPC);
+				}
+
+				// Fallthrough to next label
+				decompiledShader += "}\n";
+			}
 
-		decompiledShader += "void " + func.getIdentifier() + "() {\n";
-		compileRange(AddressRange(func.start, func.end));
-		decompiledShader += "}\n";
+			decompiledShader += "default: return;\n";
+			// Exit the switch and loop
+			decompiledShader += "} }\n";
+
+			// Exit the function
+			decompiledShader += "return;\n";
+			decompiledShader += "}\n";
+		}
 	}
 
 	return decompiledShader;
@@ -272,6 +389,33 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break;
 			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break;
 			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
+			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
+
+			case ShaderOpcodes::CMP1:
+			case ShaderOpcodes::CMP2: {
+				static constexpr std::array<const char*, 8> operators = {
+					// The last 2 operators always return true and are handled specially
+					"==", "!=", "<", "<=", ">", ">=", "", "",
+				};
+
+				const u32 cmpY = getBits<21, 3>(instruction);
+				const u32 cmpX = getBits<24, 3>(instruction);
+
+				// Compare x first
+				if (cmpX >= 6) {
+					decompiledShader += "cmp_reg.x = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.x = {}.x {} {}.x;\n", src1, operators[cmpX], src2);
+				}
+
+				// Then compare Y
+				if (cmpY >= 6) {
+					decompiledShader += "cmp_reg.y = true;\n";
+				} else {
+					decompiledShader += fmt::format("cmp_reg.y = {}.y {} {}.y;\n", src1, operators[cmpY], src2);
+				}
+				break;
+			}
 
 			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
 		}
@@ -315,7 +459,20 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
 	} else {
 		switch (opcode) {
-			case ShaderOpcodes::END: finished = true; return;
+			case ShaderOpcodes::JMPC: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 condOp = getBits<22, 2>(instruction);
+				const uint refY = getBit<24>(instruction);
+				const uint refX = getBit<25>(instruction);
+				const char* condition = getCondition(condOp, refX, refY);
+				
+				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest);
+				break;
+			}
+			case ShaderOpcodes::END:
+				decompiledShader += "return;\n";
+				finished = true;
+				return;
 			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
 		}
 	}
@@ -323,7 +480,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 	pc++;
 }
 
-
 bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 	const u32 opcode = instruction >> 26;
 	switch (opcode) {
@@ -360,3 +516,34 @@ std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& confi
 
 	return decompiler.decompile();
 }
+
+const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
+	static constexpr std::array<const char*, 16> conditions = {
+		// ref(Y, X) = (0, 0)
+		"!all(cmp_reg)",
+		"all(not(cmp_reg))",
+		"!cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (0, 1)
+		"cmp_reg.x || !cmp_reg.y",
+		"cmp_reg.x && !cmp_reg.y",
+		"cmp_reg.x",
+		"!cmp_reg.y",
+
+		// ref(Y, X) = (1, 0)
+		"!cmp_reg.x || cmp_reg.y",
+		"!cmp_reg.x && cmp_reg.y",
+		"!cmp_reg.x",
+		"cmp_reg.y",
+
+		// ref(Y, X) = (1, 1)
+		"any(cmp_reg)",
+		"all(cmp_reg)",
+		"cmp_reg.x",
+		"cmp_reg.y",
+	};
+	u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);
+
+	return conditions[key];
+}

From a20982f78acaaf519388884572d1dc03995070a1 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 26 Jul 2024 23:30:31 +0300
Subject: [PATCH 14/63] Shader decompiler: Fix destination mask

---
 src/core/PICA/shader_decompiler.cpp | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 5134845ed..9a7d768a0 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -65,14 +65,15 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		return it->second;
 	}
 
+	auto setExitMode = [&it](ExitMode mode) {
+		it->second = mode;
+		return it->second;
+	};
+
 	// Make sure not to go out of bounds on the shader
 	for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) {
 		const u32 instruction = shader.loadedShader[pc];
 		const u32 opcode = instruction >> 26;
-		auto setExitMode = [&it](ExitMode mode) {
-			it->second = mode;
-			return it->second;
-		};
 
 		switch (opcode) {
 			case ShaderOpcodes::JMPC:
@@ -332,10 +333,8 @@ void ShaderDecompiler::setDest(u32 operandDescriptor, const std::string& dest, c
 
 	// Don't write destination swizzle if all lanes are getting written to
 	decompiledShader += fmt::format("{}{} = ", dest, writtenLaneCount == 4 ? "" : destSwizzle);
-	if (writtenLaneCount == 1) {
-		decompiledShader += "float(" + value + ");\n";
-	} else if (writtenLaneCount <= 3) { // We don't need to cast for vec4, as we guarantee the rhs will be a vec4
-		decompiledShader += fmt::format("vec{}({});\n", writtenLaneCount, value);
+	if (writtenLaneCount <= 3) {
+		decompiledShader += fmt::format("({}){};\n", value, destSwizzle);
 	} else if (writtenLaneCount == 4) {
 		decompiledShader += fmt::format("{};\n", value);
 	}

From 44705508ffd1f3baeb394da7743dac77120acd2a Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 00:47:50 +0300
Subject: [PATCH 15/63] Shader Decomp: Remove pair member capture in lambda
 (unsupported on NDK)

---
 src/core/PICA/shader_decompiler.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 9a7d768a0..1395f8e3d 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -65,11 +65,6 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 		return it->second;
 	}
 
-	auto setExitMode = [&it](ExitMode mode) {
-		it->second = mode;
-		return it->second;
-	};
-
 	// Make sure not to go out of bounds on the shader
 	for (u32 pc = start; pc < PICAShader::maxInstructionCount && pc != end; pc++) {
 		const u32 instruction = shader.loadedShader[pc];
@@ -85,7 +80,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				// This opens up 2 parallel paths of execution
 				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
 				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels);
-				return setExitMode(exitParallel(branchTakenExit, branchNotTakenExit));
+				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
+				return it->second;
 			}
 			case ShaderOpcodes::IFU:
 			case ShaderOpcodes::IFC: {
@@ -96,7 +92,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				const Function* branchTakenFunc = addFunction(shader, pc + 1, dest);
 				// Check if analysis of the branch taken func failed and return unknown if it did
 				if (analysisFailed) {
-					return setExitMode(ExitMode::Unknown);
+					it->second = ExitMode::Unknown;
+					return it->second;
 				}
 
 				// Next analyze the not taken func
@@ -105,7 +102,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 					const Function* branchNotTakenFunc = addFunction(shader, dest, dest + num);
 					// Check if analysis failed and return unknown if it did
 					if (analysisFailed) {
-						return setExitMode(ExitMode::Unknown);
+						it->second = ExitMode::Unknown;
+						return it->second;
 					}
 
 					branchNotTakenExitMode = branchNotTakenFunc->exitMode;
@@ -114,11 +112,13 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				auto parallel = exitParallel(branchTakenFunc->exitMode, branchNotTakenExitMode);
 				// Both branches of the if/else end, so there's nothing after the call
 				if (parallel == ExitMode::AlwaysEnd) {
-					return setExitMode(parallel);
+					it->second = parallel;
+					return it->second;
 				} else {
 					ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels);
 					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
-					return setExitMode(conditionalExitMode);
+					it->second = conditionalExitMode;
+					return it->second;
 				}
 				break;
 			}
@@ -126,7 +126,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
 			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
 			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break;
-			case ShaderOpcodes::END: return setExitMode(ExitMode::AlwaysEnd);
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 
 			default: break;
 		}

From 37d7bad5aaf5da0f6080b5691f96ff23a72ce952 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 03:38:23 +0300
Subject: [PATCH 16/63] Disgusting changes to handle the fact that hw shader
 vertices are 2x as big

---
 include/PICA/pica_vert_config.hpp    | 20 ++++++++++++++++-
 include/PICA/shader_gen.hpp          |  3 ++-
 src/core/PICA/gpu.cpp                | 33 +++++++++++++++++++++++-----
 src/core/PICA/shader_decompiler.cpp  |  4 ++--
 src/core/PICA/shader_gen_glsl.cpp    | 29 ++++++++++++++++++++----
 src/core/renderer_gl/renderer_gl.cpp | 24 ++++++++++----------
 6 files changed, 89 insertions(+), 24 deletions(-)

diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
index ae774405d..083e1997f 100644
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@@ -6,21 +6,39 @@
 
 #include "PICA/pica_hash.hpp"
 #include "PICA/regs.hpp"
+#include "PICA/shader.hpp"
 #include "bitfield.hpp"
 #include "helpers.hpp"
 
 namespace PICA {
-	// Configuration struct used 
+	// Configuration struct used
 	struct VertConfig {
 		PICAHash::HashType shaderHash;
 		PICAHash::HashType opdescHash;
 		u32 entrypoint;
+
+		// PICA registers for configuring shader output->fragment semantic mapping
+		std::array<u32, 7> outmaps{};
+		u16 outputMask;
+		u8 outputCount;
 		bool usingUbershader;
 
 		bool operator==(const VertConfig& config) const {
 			// Hash function and equality operator required by std::unordered_map
 			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
 		}
+
+		VertConfig(PICAShader& shader, const std::array<u32, 0x300>& regs, bool usingUbershader) : usingUbershader(usingUbershader) {
+			shaderHash = shader.getCodeHash();
+			opdescHash = shader.getOpdescHash();
+			entrypoint = shader.entrypoint;
+
+			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
+			for (int i = 0; i < outputCount; i++) {
+				outputMask = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+			}
+		}
 	};
 }  // namespace PICA
 
diff --git a/include/PICA/shader_gen.hpp b/include/PICA/shader_gen.hpp
index 2d39e0787..aef16d50b 100644
--- a/include/PICA/shader_gen.hpp
+++ b/include/PICA/shader_gen.hpp
@@ -3,6 +3,7 @@
 
 #include "PICA/gpu.hpp"
 #include "PICA/pica_frag_config.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen_types.hpp"
 #include "helpers.hpp"
@@ -31,7 +32,7 @@ namespace PICA::ShaderGen {
 		std::string generate(const PICA::FragmentConfig& config);
 		std::string getDefaultVertexShader();
 		// For when PICA shader is acceleration is enabled. Turn the PICA shader source into a proper vertex shader
-		std::string getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader);
+		std::string getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader);
 
 		void setTarget(API api, Language language) {
 			this->api = api;
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index a6d734fd0..998bacf92 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -150,7 +150,19 @@ void GPU::drawArrays(bool indexed) {
 	}
 }
 
-static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
+// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
+// of 16 vec4 attributes
+union PICAVertexBuffer {
+	// Used with CPU shaders
+	std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+	// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
+	std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
+
+	PICAVertexBuffer() {}
+};
+
+static PICAVertexBuffer vertexBuffer;
 
 template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
@@ -158,6 +170,10 @@ void GPU::drawArrays() {
 		shaderJIT.prepare(shaderUnit.vs);
 	}
 
+	// We can have up to 16 attributes, each one consisting of 4 floats
+	constexpr u32 maxAttrSizeInFloats = 16 * 4;
+	auto& vertices = vertexBuffer.vertices;
+
 	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
 
 	// Base address for vertex attributes
@@ -228,7 +244,14 @@ void GPU::drawArrays() {
 			size_t tag = vertexIndex % vertexCacheSize;
 			// Cache hit
 			if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
-				vertices[i] = vertices[cache.bufferPositions[tag]];
+				if constexpr (mode != ShaderExecMode::Hardware) {
+					vertices[i] = vertices[cache.bufferPositions[tag]];
+				} else {
+					std::memcpy(
+						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
+						sizeof(float) * maxAttrSizeInFloats
+					);
+				}
 				continue;
 			}
 
@@ -361,11 +384,11 @@ void GPU::drawArrays() {
 				}
 			}
 		} else {  // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
-			PICA::Vertex& out = vertices[i];
+			float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
 			for (int j = 0; j < totalAttribCount; j++) {
 				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				// Multiply mapping * 4 as mapping refers to a vec4 whereas out.raw is an array of floats
-				std::memcpy(&out.raw[mapping * 4], &currentAttributes[j], sizeof(vec4f));
+				// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
+				std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
 			}
 		}
 	}
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 1395f8e3d..2adc36614 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -160,7 +160,7 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 
 void ShaderDecompiler::writeAttributes() {
 	decompiledShader += R"(
-	layout(location = 0) in vec4 inputs[8];
+	layout(location = 0) in vec4 inputs[16];
 	layout(std140) uniform PICAShaderUniforms {
 		vec4 uniform_float[96];
 		uvec4 uniform_int;
@@ -168,7 +168,7 @@ void ShaderDecompiler::writeAttributes() {
 	};
 
 	vec4 tmp_regs[16];
-	vec4 out_regs[8];
+	vec4 out_regs[16];
 	vec4 dummy_vec = vec4(0.0);
 	bvec2 cmp_reg = bvec2(false);
 )";
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index 8eaf09e8f..3920bed4f 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -671,7 +671,28 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 	shader += "combinerOutput.rgb = mix(fog_color, combinerOutput.rgb, fog_factor);";
 }
 
-std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, bool usingUbershader) {
+std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
+	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	{
+		uint count = 0;
+		u16 outputMask = vertConfig.outputMask;
+		std::array<u8, 16> vsOutputRegisters;
+
+		// See which registers are actually enabled and ignore the disabled ones
+		for (int i = 0; i < 16; i++) {
+			if (outputMask & 1) {
+				vsOutputRegisters[count++] = i;
+			}
+
+			outputMask >>= 1;
+		}
+
+		// For the others, map the index to a vs output directly (TODO: What does hw actually do?)
+		for (; count < 16; count++) {
+			vsOutputRegisters[count] = count;
+		}
+	}
+
 	if (usingUbershader) {
 		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
 		return picaSource;
@@ -704,8 +725,8 @@ void main() {
 	float a_texcoord0_w = out_regs[2].w;
 	vec2 a_texcoord1 = out_regs[3].xy;
 	vec2 a_texcoord2 = out_regs[4].xy;
-	vec3 a_view = out_regs[5].xyz;
-	vec4 a_quaternion = out_regs[6];
+	vec3 a_view = out_regs[2].xyz;
+	vec4 a_quaternion = out_regs[3];
 
 	gl_Position = a_coords;
 	vec4 colourAbs = abs(a_vertexColour);
@@ -722,7 +743,7 @@ void main() {
 	gl_ClipDistance[1] = dot(clipCoords, a_coords);
 #endif
 })";
-
+		std::cout << ret << "\n";
 		return ret;
 	}
 }
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 17e3702f3..6fd266baa 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -88,7 +88,7 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.bindUBO(hwShaderUniformUBO);
 	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
 
-	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize, GL_STREAM_DRAW);
+	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
 	gl.bindVBO(vbo);
 	// Initialize the VAO used when not using hw shaders
 	defaultVAO.create();
@@ -122,8 +122,8 @@ void RendererGL::initGraphicsContextInternal() {
 	// Initialize the VAO used for hw shaders
 	hwShaderVAO.create();
 	gl.bindVAO(hwShaderVAO);
-	for (int attr = 0; attr < 8; attr++) {
-		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex), attr * sizeof(float) * 4);
+	for (int attr = 0; attr < 16; attr++) {
+		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
 		hwShaderVAO.enableAttribute(attr);
 	}
 
@@ -495,7 +495,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 
 	setupStencilTest(stencilEnable);
 
-	vbo.bufferVertsSub(vertices);
+	// If we're using hardware shaders, the vertex array works completely different
+	// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
+	if (!usingAcceleratedShader) {
+		vbo.bufferVertsSub(vertices);
+	} else {
+		glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
+	}
+
 	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 }
 
@@ -956,12 +963,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 	if (usingAcceleratedShader) {
 		auto shaderCodeHash = shaderUnit.vs.getCodeHash();
 		auto opdescHash = shaderUnit.vs.getOpdescHash();
-		auto vertexConfig = PICA::VertConfig{
-			.shaderHash = shaderCodeHash,
-			.opdescHash = opdescHash,
-			.entrypoint = shaderUnit.vs.entrypoint,
-			.usingUbershader = usingUbershader,
-		};
+		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
 
 		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];
 		// If the optional is false, we have never tried to recompile the shader before. Try to recompile it and see if it works.
@@ -976,7 +978,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 			// Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload
 			// it to the GPU
 			if (!picaShaderSource.empty()) {
-				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, usingUbershader);
+				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
 				shader->create({vertexShaderSource}, OpenGL::Vertex);
 			}
 		}

From 9ee1c3964a1568bd23984118aa92db6f32b75784 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 15:36:22 +0300
Subject: [PATCH 17/63] Shader decompiler: Implement proper output semantic
 mapping

---
 include/PICA/pica_vert_config.hpp |  2 +-
 src/core/PICA/shader_gen_glsl.cpp | 56 +++++++++++++++++++++++++------
 2 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
index 083e1997f..606a28e61 100644
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@@ -36,7 +36,7 @@ namespace PICA {
 			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
 			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
 			for (int i = 0; i < outputCount; i++) {
-				outputMask = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i];
 			}
 		}
 	};
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index 3920bed4f..1aa307332 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -1,3 +1,7 @@
+#include <fmt/format.h>
+
+#include <utility>
+
 #include "PICA/pica_frag_config.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
@@ -673,10 +677,15 @@ void FragmentGenerator::compileFog(std::string& shader, const PICA::FragmentConf
 
 std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& picaSource, const PICA::VertConfig& vertConfig, bool usingUbershader) {
 	// First, calculate output register -> Fixed function fragment semantics based on the VAO config
+	// This array contains the mappings for the 32 fixed function semantics (8 variables, with 4 lanes each).
+	// Each entry is a pair, containing the output reg to use for this semantic (first) and which lane of that register (second)
+	std::array<std::pair<int, int>, 32> outputMappings{};
+	// Output registers adjusted according to VS_OUTPUT_MASK, which handles enabling and disabling output attributes
+	std::array<u8, 16> vsOutputRegisters;
+
 	{
 		uint count = 0;
 		u16 outputMask = vertConfig.outputMask;
-		std::array<u8, 16> vsOutputRegisters;
 
 		// See which registers are actually enabled and ignore the disabled ones
 		for (int i = 0; i < 16; i++) {
@@ -691,8 +700,38 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic
 		for (; count < 16; count++) {
 			vsOutputRegisters[count] = count;
 		}
+
+		for (int i = 0; i < vertConfig.outputCount; i++) {
+			const u32 config = vertConfig.outmaps[i];
+			for (int j = 0; j < 4; j++) {
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				outputMappings[mapping] = std::make_pair(vsOutputRegisters[i], j);
+			}
+		}
 	}
 
+	auto getSemanticName = [&](u32 semanticIndex) {
+		auto [reg, lane] = outputMappings[semanticIndex];
+		return fmt::format("out_regs[{}][{}]", reg, lane);
+	};
+
+	std::string semantics = fmt::format(
+		R"(
+		vec4 a_coords = vec4({}, {}, {}, {});
+		vec4 a_quaternion = vec4({}, {}, {}, {});
+		vec4 a_vertexColour = vec4({}, {}, {}, {});
+		vec2 a_texcoord0 = vec2({}, {});
+		float a_texcoord0_w = {};
+		vec2 a_texcoord1 = vec2({}, {});
+		vec2 a_texcoord2 = vec2({}, {});
+		vec3 a_view = vec3({}, {}, {});
+	)",
+		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
+		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
+		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),
+		getSemanticName(18), getSemanticName(19), getSemanticName(20)
+	);
+
 	if (usingUbershader) {
 		Helpers::panic("Unimplemented: GetVertexShaderAccelerated for ubershader");
 		return picaSource;
@@ -719,15 +758,11 @@ out vec2 v_texcoord2;
 
 void main() {
 	pica_shader_main();
-	vec4 a_coords = out_regs[0];
-	vec4 a_vertexColour = out_regs[1];
-	vec2 a_texcoord0 = out_regs[2].xy;
-	float a_texcoord0_w = out_regs[2].w;
-	vec2 a_texcoord1 = out_regs[3].xy;
-	vec2 a_texcoord2 = out_regs[4].xy;
-	vec3 a_view = out_regs[2].xyz;
-	vec4 a_quaternion = out_regs[3];
-
+)";
+	// Transfer fixed function fragment registers from vertex shader output to the fragment shader
+	ret += semantics;
+	
+	ret += R"(
 	gl_Position = a_coords;
 	vec4 colourAbs = abs(a_vertexColour);
 	v_colour = min(colourAbs, vec4(1.f));
@@ -743,6 +778,7 @@ void main() {
 	gl_ClipDistance[1] = dot(clipCoords, a_coords);
 #endif
 })";
+		
 		std::cout << ret << "\n";
 		return ret;
 	}

From 6c738e821dcbac7e98bcc1f62ee956a72fdc3a76 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 16:06:38 +0300
Subject: [PATCH 18/63] Moar instructions

---
 include/PICA/shader_decompiler.hpp  | 12 +++--
 src/core/PICA/shader_decompiler.cpp | 84 +++++++++++++++++++++++++++--
 src/core/PICA/shader_gen_glsl.cpp   | 18 +++----
 3 files changed, 97 insertions(+), 17 deletions(-)

diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp
index 42bd56429..d992d0df2 100644
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@@ -1,10 +1,12 @@
 #pragma once
+#include <fmt/format.h>
+
+#include <map>
 #include <set>
 #include <string>
 #include <tuple>
-#include <map>
-#include <vector>
 #include <utility>
+#include <vector>
 
 #include "PICA/shader.hpp"
 #include "PICA/shader_gen_types.hpp"
@@ -42,9 +44,9 @@ namespace PICA::ShaderGen {
 			explicit Function(u32 start, u32 end) : start(start), end(end) {}
 			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }
 
-			std::string getIdentifier() const { return "func_" + std::to_string(start) + "_to_" + std::to_string(end); }
-			std::string getForwardDecl() const { return "void " + getIdentifier() + "();\n"; }
-			std::string getCallStatement() const { return getIdentifier() + "()"; }
+			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
+			std::string getForwardDecl() const { return fmt::format("void fn_{}_{}();\n", start, end); }
+			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 		};
 
 		std::set<Function> functions{};
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 2adc36614..899aff298 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -85,7 +85,6 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 			}
 			case ShaderOpcodes::IFU:
 			case ShaderOpcodes::IFC: {
-				Helpers::panic("IFC/IFU");
 				const u32 num = instruction & 0xff;
 				const u32 dest = getBits<10, 12>(instruction);
 
@@ -122,7 +121,29 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				}
 				break;
 			}
-			case ShaderOpcodes::CALL: Helpers::panic("Unimplemented control flow operation (CALL)"); break;
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the branch taken func failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (calledFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
 			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
 			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
 			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break;
@@ -464,14 +485,71 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const uint refY = getBit<24>(instruction);
 				const uint refX = getBit<25>(instruction);
 				const char* condition = getCondition(condOp, refX, refY);
-				
+
 				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest);
 				break;
 			}
+
+			case ShaderOpcodes::IFU:
+			case ShaderOpcodes::IFC: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* conditionalFunc = findFunction(AddressRange(pc + 1, dest));
+
+				if (opcode == ShaderOpcodes::IFC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
+				callFunction(*conditionalFunc);
+				decompiledShader += "}\n";
+
+				pc = dest;
+				if (num > 0) {
+					const Function* elseFunc = findFunction(AddressRange(dest, dest + num));
+					pc = dest + num;
+
+					decompiledShader += "else { ";
+					callFunction(*elseFunc);
+					decompiledShader += "}\n";
+
+					if (conditionalFunc->exitMode == ExitMode::AlwaysEnd && elseFunc->exitMode == ExitMode::AlwaysEnd) {
+						finished = true;
+						return;
+					}
+				}
+
+				return;
+			}
+
+			case ShaderOpcodes::CALL: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
+				callFunction(*calledFunc);
+
+				if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
 			case ShaderOpcodes::END:
 				decompiledShader += "return;\n";
 				finished = true;
 				return;
+
+			case ShaderOpcodes::NOP: break;
 			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
 		}
 	}
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index 1aa307332..affe9837f 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -717,15 +717,15 @@ std::string FragmentGenerator::getVertexShaderAccelerated(const std::string& pic
 
 	std::string semantics = fmt::format(
 		R"(
-		vec4 a_coords = vec4({}, {}, {}, {});
-		vec4 a_quaternion = vec4({}, {}, {}, {});
-		vec4 a_vertexColour = vec4({}, {}, {}, {});
-		vec2 a_texcoord0 = vec2({}, {});
-		float a_texcoord0_w = {};
-		vec2 a_texcoord1 = vec2({}, {});
-		vec2 a_texcoord2 = vec2({}, {});
-		vec3 a_view = vec3({}, {}, {});
-	)",
+	vec4 a_coords = vec4({}, {}, {}, {});
+	vec4 a_quaternion = vec4({}, {}, {}, {});
+	vec4 a_vertexColour = vec4({}, {}, {}, {});
+	vec2 a_texcoord0 = vec2({}, {});
+	float a_texcoord0_w = {};
+	vec2 a_texcoord1 = vec2({}, {});
+	vec2 a_texcoord2 = vec2({}, {});
+	vec3 a_view = vec3({}, {}, {});
+)",
 		getSemanticName(0), getSemanticName(1), getSemanticName(2), getSemanticName(3), getSemanticName(4), getSemanticName(5), getSemanticName(6),
 		getSemanticName(7), getSemanticName(8), getSemanticName(9), getSemanticName(10), getSemanticName(11), getSemanticName(12),
 		getSemanticName(13), getSemanticName(16), getSemanticName(14), getSemanticName(15), getSemanticName(22), getSemanticName(23),

From d125180847ca92c9ed4dcd18f5d880b94fa7fe10 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 17:48:16 +0300
Subject: [PATCH 19/63] Shader decompiler: Add FLR/SLT/SLTI/SGE/SGEI

---
 include/PICA/pica_vert_config.hpp    | 3 ++-
 src/core/PICA/shader_decompiler.cpp  | 7 +++++++
 src/core/renderer_gl/renderer_gl.cpp | 2 --
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
index 606a28e61..ba66426d3 100644
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@@ -36,7 +36,8 @@ namespace PICA {
 			outputCount = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
 			outputMask = regs[PICA::InternalRegs::VertexShaderOutputMask];
 			for (int i = 0; i < outputCount; i++) {
-				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+				// Mask out unused bits
+				outmaps[i] = regs[PICA::InternalRegs::ShaderOutmap0 + i] & 0x1F1F1F1F;
 			}
 		}
 	};
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 899aff298..e028d6d88 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -408,9 +408,16 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 
 			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break;
 			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break;
+			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
 			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
 			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
 
+			case ShaderOpcodes::SLT:
+			case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;
+
+			case ShaderOpcodes::SGE:
+			case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
+
 			case ShaderOpcodes::CMP1:
 			case ShaderOpcodes::CMP2: {
 				static constexpr std::array<const char*, 8> operators = {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 6fd266baa..78dfb98fd 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -961,8 +961,6 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 	usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader;
 
 	if (usingAcceleratedShader) {
-		auto shaderCodeHash = shaderUnit.vs.getCodeHash();
-		auto opdescHash = shaderUnit.vs.getOpdescHash();
 		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
 
 		std::optional<OpenGL::Shader>& shader = shaderCache.vertexShaderCache[vertexConfig];

From 4040d885c6eb4ec990c3cbc2890f2a4cce6b245e Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 21:25:51 +0300
Subject: [PATCH 20/63] Shader decompiler: Add register indexing

---
 src/core/PICA/shader_decompiler.cpp | 57 ++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 17 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index e028d6d88..da880fcc2 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -191,7 +191,14 @@ void ShaderDecompiler::writeAttributes() {
 	vec4 tmp_regs[16];
 	vec4 out_regs[16];
 	vec4 dummy_vec = vec4(0.0);
+	ivec3 addr_reg = ivec3(0);
 	bvec2 cmp_reg = bvec2(false);
+
+	vec4 float_uniform_indexed(int source, int offset) {
+		int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
+		uint index = uint(clipped_offs + source) & 127u;
+		return (index < 96u) ? uniform_float[index] : vec4(1.0);
+	}
 )";
 }
 
@@ -284,10 +291,15 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index)
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;
 
-		if (floatIndex >= 96) [[unlikely]] {
-			return "dummy_vec";
+		if (index == 0) {
+			if (floatIndex >= 96) [[unlikely]] {
+				return "dummy_vec";
+			}
+			return "uniform_float[" + std::to_string(floatIndex) + "]";
+		} else {
+			static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
+			return fmt::format("float_uniform_indexed({}, {})", floatIndex, offsets[index]);
 		}
-		return "uniform_float[" + std::to_string(floatIndex) + "]";
 	}
 }
 
@@ -391,14 +403,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 
 		std::string dest = getDest(destIndex);
 
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
-		}
-
-		if (invertSources) {
-			Helpers::panic("GLSL recompiler: Inverted instruction");
-		}
-
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
 			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
@@ -444,6 +448,20 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				break;
 			}
 
+			case ShaderOpcodes::MOVA: {
+				const bool writeX = getBit<3>(operandDescriptor);  // Should we write the x component of the address register?
+				const bool writeY = getBit<2>(operandDescriptor);
+
+				if (writeX) {
+					decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
+				}
+
+				if (writeY) {
+					decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
+				}
+				break;
+			}
+
 			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
 		}
 	} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
@@ -478,11 +496,6 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		src3 += getSwizzlePattern(swizzle3);
 
 		std::string dest = getDest(destIndex);
-
-		if (idx != 0) {
-			Helpers::panic("GLSL recompiler: Indexed instruction");
-		}
-
 		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
 	} else {
 		switch (opcode) {
@@ -493,7 +506,16 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const uint refX = getBit<25>(instruction);
 				const char* condition = getCondition(condOp, refX, refY);
 
-				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}", condition, dest);
+				decompiledShader += fmt::format("if ({}) {{ pc = {}u; break; }}\n", condition, dest);
+				break;
+			}
+
+			case ShaderOpcodes::JMPU: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+				const u32 mask = 1u << bit;
+
+				decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{ pc = {}u; break; }}\n", mask, dest);
 				break;
 			}
 
@@ -556,6 +578,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				finished = true;
 				return;
 
+
 			case ShaderOpcodes::NOP: break;
 			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
 		}

From 94bd0600820922813d53d32691bc2168fcd36adf Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 28 Jul 2024 21:28:16 +0300
Subject: [PATCH 21/63] Shader decompiler: Optimize mova with both x and y
 masked

---
 src/core/PICA/shader_decompiler.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index da880fcc2..607b5c926 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -452,11 +452,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const bool writeX = getBit<3>(operandDescriptor);  // Should we write the x component of the address register?
 				const bool writeY = getBit<2>(operandDescriptor);
 
-				if (writeX) {
+				if (writeX && writeY) {
+					decompiledShader += fmt::format("addr_reg.xy = ivec2({}.xy);\n", src1);
+				} else if (writeX) {
 					decompiledShader += fmt::format("addr_reg.x = int({}.x);\n", src1);
-				}
-
-				if (writeY) {
+				} else if (writeY) {
 					decompiledShader += fmt::format("addr_reg.y = int({}.y);\n", src1);
 				}
 				break;

From 59f4f236d88a7f3abb8c0b2863beca74eb53471a Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 29 Jul 2024 00:21:30 +0300
Subject: [PATCH 22/63] Shader decompiler: Add DPH/DPHI

---
 src/core/PICA/shader_decompiler.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 607b5c926..d2a3405de 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -422,6 +422,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 			case ShaderOpcodes::SGE:
 			case ShaderOpcodes::SGEI: setDest(operandDescriptor, dest, fmt::format("vec4(greaterThanEqual({}, {}))", src1, src2)); break;
 
+			case ShaderOpcodes::DPH:
+			case ShaderOpcodes::DPHI:
+				setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2)); break;
+
 			case ShaderOpcodes::CMP1:
 			case ShaderOpcodes::CMP2: {
 				static constexpr std::array<const char*, 8> operators = {

From 72097404180f566ffe685ecfe938b1888483b794 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 29 Jul 2024 01:03:41 +0300
Subject: [PATCH 23/63] Fix shader caching being broken

---
 include/PICA/pica_vert_config.hpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/PICA/pica_vert_config.hpp b/include/PICA/pica_vert_config.hpp
index ba66426d3..4300e4542 100644
--- a/include/PICA/pica_vert_config.hpp
+++ b/include/PICA/pica_vert_config.hpp
@@ -1,5 +1,6 @@
 #pragma once
 #include <array>
+#include <cassert>
 #include <cstring>
 #include <type_traits>
 #include <unordered_map>
@@ -23,6 +24,10 @@ namespace PICA {
 		u8 outputCount;
 		bool usingUbershader;
 
+		// Pad to 56 bytes so that the compiler won't insert unnecessary padding, which in turn will affect our unordered_map lookup
+		// As the padding will get hashed and memcmp'd...
+		u32 pad{};
+
 		bool operator==(const VertConfig& config) const {
 			// Hash function and equality operator required by std::unordered_map
 			return std::memcmp(this, &config, sizeof(VertConfig)) == 0;
@@ -43,6 +48,8 @@ namespace PICA {
 	};
 }  // namespace PICA
 
+static_assert(sizeof(PICA::VertConfig) == 56);
+
 // Override std::hash for our vertex config class
 template <>
 struct std::hash<PICA::VertConfig> {

From 0d6bef2d70c06a6c3cf3bf350715b8bcfe1f6088 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 29 Jul 2024 01:27:13 +0300
Subject: [PATCH 24/63] PICA decompiler: Cache VS uniforms

---
 include/PICA/shader.hpp              | 11 +++++++++++
 src/core/PICA/regs.cpp               |  2 +-
 src/core/PICA/shader_unit.cpp        |  1 +
 src/core/renderer_gl/renderer_gl.cpp |  6 +++++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/PICA/shader.hpp b/include/PICA/shader.hpp
index 7f127795e..535b6f4cc 100644
--- a/include/PICA/shader.hpp
+++ b/include/PICA/shader.hpp
@@ -133,6 +133,10 @@ class PICAShader {
 	Hash lastCodeHash = 0;    // Last hash computed for the shader code (Used for the JIT caching mechanism)
 	Hash lastOpdescHash = 0;  // Last hash computed for the operand descriptors (Also used for the JIT)
 
+  public:
+	bool uniformsDirty = false;
+
+  protected:
 	bool codeHashDirty = false;
 	bool opdescHashDirty = false;
 
@@ -283,6 +287,7 @@ class PICAShader {
 				uniform[2] = f24::fromRaw(((floatUniformBuffer[0] & 0xff) << 16) | (floatUniformBuffer[1] >> 16));
 				uniform[3] = f24::fromRaw(floatUniformBuffer[0] >> 8);
 			}
+			uniformsDirty = true;
 		}
 	}
 
@@ -294,6 +299,12 @@ class PICAShader {
 		u[1] = getBits<8, 8>(word);
 		u[2] = getBits<16, 8>(word);
 		u[3] = getBits<24, 8>(word);
+		uniformsDirty = true;
+	}
+
+	void uploadBoolUniform(u32 value) {
+		boolUniform = value;
+		uniformsDirty = true;
 	}
 
 	void run();
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index c9412fc8f..0c5f4adb7 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -301,7 +301,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 		}
 
 		case VertexBoolUniform: {
-			shaderUnit.vs.boolUniform = value & 0xffff;
+			shaderUnit.vs.uploadBoolUniform(value & 0xffff);
 			break;
 		}
 
diff --git a/src/core/PICA/shader_unit.cpp b/src/core/PICA/shader_unit.cpp
index 759849a8a..6b291d31c 100644
--- a/src/core/PICA/shader_unit.cpp
+++ b/src/core/PICA/shader_unit.cpp
@@ -34,4 +34,5 @@ void PICAShader::reset() {
 
 	codeHashDirty = true;
 	opdescHashDirty = true;
+	uniformsDirty = true;
 }
\ No newline at end of file
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 78dfb98fd..6e50f77be 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -987,7 +987,11 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 		} else {
 			generatedVertexShader = &(*shader);
 			gl.bindUBO(hwShaderUniformUBO);
-			glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
+
+			if (shaderUnit.vs.uniformsDirty) {
+				shaderUnit.vs.uniformsDirty = false;
+				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
+			}
 		}
 	}
 

From 1c9df7c02c6caf21c586782ad1b388570faeb0e5 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 29 Jul 2024 01:42:56 +0300
Subject: [PATCH 25/63] Simplify vertex cache code

---
 src/core/PICA/gpu.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 998bacf92..6cbdb100b 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -174,7 +174,9 @@ void GPU::drawArrays() {
 	constexpr u32 maxAttrSizeInFloats = 16 * 4;
 	auto& vertices = vertexBuffer.vertices;
 
-	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	if constexpr (mode != ShaderExecMode::Hardware) {
+		setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	}
 
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -247,8 +249,9 @@ void GPU::drawArrays() {
 				if constexpr (mode != ShaderExecMode::Hardware) {
 					vertices[i] = vertices[cache.bufferPositions[tag]];
 				} else {
+					const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
 					std::memcpy(
-						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
+						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
 						sizeof(float) * maxAttrSizeInFloats
 					);
 				}

From 53ee3f305127cf2fa53effc8eac5c61d04caf1d3 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 29 Jul 2024 01:42:56 +0300
Subject: [PATCH 26/63] Simplify vertex cache code

---
 src/core/PICA/gpu.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 998bacf92..6cbdb100b 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -174,7 +174,9 @@ void GPU::drawArrays() {
 	constexpr u32 maxAttrSizeInFloats = 16 * 4;
 	auto& vertices = vertexBuffer.vertices;
 
-	setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	if constexpr (mode != ShaderExecMode::Hardware) {
+		setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
+	}
 
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -247,8 +249,9 @@ void GPU::drawArrays() {
 				if constexpr (mode != ShaderExecMode::Hardware) {
 					vertices[i] = vertices[cache.bufferPositions[tag]];
 				} else {
+					const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
 					std::memcpy(
-						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cache.bufferPositions[tag] * maxAttrSizeInFloats],
+						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
 						sizeof(float) * maxAttrSizeInFloats
 					);
 				}

From b46f7ad9bcbbc4c27da2276d36667d55bd7071de Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 1 Aug 2024 00:55:20 +0300
Subject: [PATCH 27/63] Shader decompiler: Add loops

---
 src/core/PICA/shader_decompiler.cpp | 44 +++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index d2a3405de..b441c8135 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -146,7 +146,24 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 			}
 			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
 			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
-			case ShaderOpcodes::LOOP: Helpers::panic("Unimplemented control flow operation (LOOP)"); break;
+			case ShaderOpcodes::LOOP: {
+				u32 dest = getBits<10, 12>(instruction);
+				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				if (loopFunction->exitMode == ExitMode::AlwaysEnd) {
+					it->second = ExitMode::AlwaysEnd;
+					return it->second;
+				}
+
+				ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode);
+				it->second = exitMode;
+				return it->second;
+			}
 			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 
 			default: break;
@@ -577,12 +594,35 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				break;
 			}
 
+			case ShaderOpcodes::LOOP: {
+				const u32 dest = getBits<10, 12>(instruction);
+				const u32 uniformIndex = getBits<22, 2>(instruction);
+
+				// loop counter = uniform.y
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 16u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format(
+					"for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 24) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> "
+					"8u) & 0xFFu)) {{\n",
+					pc, pc, uniformIndex, pc, uniformIndex
+				);
+
+				AddressRange range(pc + 1, dest + 1);
+				const Function* func = findFunction(range);
+				callFunction(*func);
+				decompiledShader += "}\n";
+
+				if (func->exitMode == ExitMode::AlwaysEnd) {
+					finished = true;
+					return;
+				}
+				break;
+			}
+
 			case ShaderOpcodes::END:
 				decompiledShader += "return;\n";
 				finished = true;
 				return;
 
-
 			case ShaderOpcodes::NOP: break;
 			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
 		}

From c7371e3bf4f627700688a896a56d2ee0a8e99e5f Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Thu, 8 Aug 2024 00:38:52 +0300
Subject: [PATCH 28/63] Shader decompiler: Implement safe multiplication

---
 src/core/PICA/shader_decompiler.cpp | 42 ++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index b441c8135..6e7304e15 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -241,7 +241,7 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += R"(
 			vec4 safe_mul(vec4 a, vec4 b) {
 				vec4 res = a * b;
-				return mix(res, mix(mix(vec4(0.0), res, isnan(rhs)), product, isnan(lhs)), isnan(res));
+				return mix(res, mix(mix(vec4(0.0), res, isnan(b)), res, isnan(a)), isnan(res));
 			}
 		)";
 	}
@@ -423,12 +423,32 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		switch (opcode) {
 			case ShaderOpcodes::MOV: setDest(operandDescriptor, dest, src1); break;
 			case ShaderOpcodes::ADD: setDest(operandDescriptor, dest, fmt::format("{} + {}", src1, src2)); break;
-			case ShaderOpcodes::MUL: setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2)); break;
+			case ShaderOpcodes::MUL:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("{} * {}", src1, src2));
+				} else {
+					setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {})", src1, src2));
+				}
+				break;
 			case ShaderOpcodes::MAX: setDest(operandDescriptor, dest, fmt::format("max({}, {})", src1, src2)); break;
 			case ShaderOpcodes::MIN: setDest(operandDescriptor, dest, fmt::format("min({}, {})", src1, src2)); break;
 
-			case ShaderOpcodes::DP3: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2)); break;
-			case ShaderOpcodes::DP4: setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2)); break;
+			case ShaderOpcodes::DP3:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}.xyz, {}.xyz))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec3(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}).xyz, vec3(1.0)))", src1, src2));
+				}
+				break;
+			case ShaderOpcodes::DP4:
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot({}, {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul({}, {}), vec4(1.0)))", src1, src2));
+				}
+				break;
 			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
 			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
 			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
@@ -441,7 +461,13 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 
 			case ShaderOpcodes::DPH:
 			case ShaderOpcodes::DPHI:
-				setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2)); break;
+				if (!config.accurateShaderMul) {
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(vec4({}.xyz, 1.0), {}))", src1, src2));
+				} else {
+					// A dot product between a and b is equivalent to the per-lane multiplication of a and b followed by a dot product with vec4(1.0)
+					setDest(operandDescriptor, dest, fmt::format("vec4(dot(safe_mul(vec4({}.xyz, 1.0), {}), vec4(1.0)))", src1, src2));
+				}
+				break;
 
 			case ShaderOpcodes::CMP1:
 			case ShaderOpcodes::CMP2: {
@@ -517,7 +543,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 		src3 += getSwizzlePattern(swizzle3);
 
 		std::string dest = getDest(destIndex);
-		setDest(operandDescriptor, dest, src1 + " * " + src2 + " + " + src3);
+		if (!config.accurateShaderMul) {
+			setDest(operandDescriptor, dest, fmt::format("{} * {} + {}", src1, src2, src3));
+		} else {
+			setDest(operandDescriptor, dest, fmt::format("safe_mul({}, {}) + {}", src1, src2, src3));
+		}
 	} else {
 		switch (opcode) {
 			case ShaderOpcodes::JMPC: {

From 7e04ab78e8d621b3d583129b6b6aaccbe38c8352 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 19 Aug 2024 22:32:55 +0300
Subject: [PATCH 29/63] Shader decompiler: Implement LG2/EX2

---
 src/core/PICA/shader_decompiler.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 6e7304e15..10afc3061 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -452,6 +452,8 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 			case ShaderOpcodes::FLR: setDest(operandDescriptor, dest, fmt::format("floor({})", src1)); break;
 			case ShaderOpcodes::RSQ: setDest(operandDescriptor, dest, fmt::format("vec4(inversesqrt({}.x))", src1)); break;
 			case ShaderOpcodes::RCP: setDest(operandDescriptor, dest, fmt::format("vec4(1.0 / {}.x)", src1)); break;
+			case ShaderOpcodes::LG2: setDest(operandDescriptor, dest, fmt::format("vec4(log2({}.x))", src1)); break;
+			case ShaderOpcodes::EX2: setDest(operandDescriptor, dest, fmt::format("vec4(exp2({}.x))", src1)); break;
 
 			case ShaderOpcodes::SLT:
 			case ShaderOpcodes::SLTI: setDest(operandDescriptor, dest, fmt::format("vec4(lessThan({}, {}))", src1, src2)); break;

From e481ce87a9cc0c16aeb898790054976183eb7994 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:15:44 +0300
Subject: [PATCH 30/63] Shader decompiler: More control flow

---
 include/PICA/shader_decompiler.hpp  |  5 +-
 src/core/PICA/shader_decompiler.cpp | 95 +++++++++++++++++++++++------
 2 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp
index d992d0df2..b7bd869c3 100644
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@@ -45,7 +45,10 @@ namespace PICA::ShaderGen {
 			bool operator<(const Function& other) const { return AddressRange(start, end) < AddressRange(other.start, other.end); }
 
 			std::string getIdentifier() const { return fmt::format("fn_{}_{}", start, end); }
-			std::string getForwardDecl() const { return fmt::format("void fn_{}_{}();\n", start, end); }
+			// To handle weird control flow, we have to return from each function a bool that indicates whether or not the shader reached an end
+			// instruction and should thus terminate. This is necessary for games like Rayman and Gravity Falls, which have "END" instructions called
+			// from within functions deep in the callstack
+			std::string getForwardDecl() const { return fmt::format("bool fn_{}_{}();\n", start, end); }
 			std::string getCallStatement() const { return fmt::format("fn_{}_{}()", start, end); }
 		};
 
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 10afc3061..cab55fb01 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -138,14 +138,33 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				}
 
 				// Exit mode of the remainder of this function, after we return from the callee
-				ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
-				ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode);
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode);
 
 				it->second = exitMode;
 				return exitMode;
 			}
-			case ShaderOpcodes::CALLC: Helpers::panic("Unimplemented control flow operation (CALLC)"); break;
-			case ShaderOpcodes::CALLU: Helpers::panic("Unimplemented control flow operation (CALLU)"); break;
+
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
+				const u32 num = instruction & 0xff;
+				const u32 dest = getBits<10, 12>(instruction);
+				const Function* calledFunction = addFunction(shader, dest, dest + num);
+
+				// Check if analysis of the called function failed and return unknown if it did
+				if (analysisFailed) {
+					it->second = ExitMode::Unknown;
+					return it->second;
+				}
+
+				// Exit mode of the remainder of this function, after we return from the callee
+				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
+				const ExitMode exitMode = exitSeries(exitParallel(calledFunction->exitMode, ExitMode::AlwaysReturn), postCallExitMode);
+
+				it->second = exitMode;
+				return exitMode;
+			}
+
 			case ShaderOpcodes::LOOP: {
 				u32 dest = getBits<10, 12>(instruction);
 				const Function* loopFunction = addFunction(shader, pc + 1, dest + 1);
@@ -159,13 +178,13 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 					return it->second;
 				}
 
-				ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
-				ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode);
+				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
+				const ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode);
 				it->second = exitMode;
 				return it->second;
 			}
-			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 
+			case ShaderOpcodes::END: it->second = ExitMode::AlwaysEnd; return it->second;
 			default: break;
 		}
 	}
@@ -251,15 +270,20 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += func.getForwardDecl();
 	}
 
-	decompiledShader += "void pica_shader_main() {\n";
+	decompiledShader += "bool pica_shader_main() {\n";
 	AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
 	callFunction(*findFunction(mainFunctionRange));
-	decompiledShader += "}\n";
+	decompiledShader += "return true;\n}\n";
 
 	for (const Function& func : controlFlow.functions) {
 		if (func.outLabels.empty()) {
-			decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier());
-			compileRange(AddressRange(func.start, func.end));
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
+
+			auto [pc, finished] = compileRange(AddressRange(func.start, func.end));
+			if (!finished) {
+				decompiledShader += "return false;";
+			}
+
 			decompiledShader += "}\n";
 		} else {
 			auto labels = func.outLabels;
@@ -267,7 +291,7 @@ std::string ShaderDecompiler::decompile() {
 
 			// If a function has jumps and "labels", this needs to be emulated using a switch-case, with the variable being switched on being the
 			// current PC
-			decompiledShader += fmt::format("void {}() {{\n", func.getIdentifier());
+			decompiledShader += fmt::format("bool {}() {{\n", func.getIdentifier());
 			decompiledShader += fmt::format("uint pc = {}u;\n", func.start);
 			decompiledShader += "while(true){\nswitch(pc){\n";
 
@@ -287,12 +311,12 @@ std::string ShaderDecompiler::decompile() {
 				decompiledShader += "}\n";
 			}
 
-			decompiledShader += "default: return;\n";
+			decompiledShader += "default: return false;\n";
 			// Exit the switch and loop
 			decompiledShader += "} }\n";
 
 			// Exit the function
-			decompiledShader += "return;\n";
+			decompiledShader += "return false;\n";
 			decompiledShader += "}\n";
 		}
 	}
@@ -613,12 +637,35 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				return;
 			}
 
-			case ShaderOpcodes::CALL: {
+			case ShaderOpcodes::CALL:
+			case ShaderOpcodes::CALLC:
+			case ShaderOpcodes::CALLU: {
 				const u32 num = instruction & 0xff;
 				const u32 dest = getBits<10, 12>(instruction);
 				const Function* calledFunc = findFunction(AddressRange(dest, dest + num));
+
+				// Handle conditions for CALLC/CALLU
+				if (opcode == ShaderOpcodes::CALLC) {
+					const u32 condOp = getBits<22, 2>(instruction);
+					const uint refY = getBit<24>(instruction);
+					const uint refX = getBit<25>(instruction);
+					const char* condition = getCondition(condOp, refX, refY);
+
+					decompiledShader += fmt::format("if ({}) {{", condition);
+				} else if (opcode == ShaderOpcodes::CALLU) {
+					const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
+					const u32 mask = 1u << bit;
+
+					decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{", mask);
+				}
+
 				callFunction(*calledFunc);
 
+				// Close brackets for CALLC/CALLU
+				if (opcode != ShaderOpcodes::CALL) {
+					decompiledShader += "}";
+				}
+
 				if (opcode == ShaderOpcodes::CALL && calledFunc->exitMode == ExitMode::AlwaysEnd) {
 					finished = true;
 					return;
@@ -651,7 +698,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 			}
 
 			case ShaderOpcodes::END:
-				decompiledShader += "return;\n";
+				decompiledShader += "return true;\n";
 				finished = true;
 				return;
 
@@ -686,13 +733,23 @@ bool ShaderDecompiler::usesCommonEncoding(u32 instruction) const {
 		case ShaderOpcodes::SLT:
 		case ShaderOpcodes::SLTI:
 		case ShaderOpcodes::SGE:
-		case ShaderOpcodes::SGEI: return true;
+		case ShaderOpcodes::SGEI:
+		case ShaderOpcodes::LITP: return true;
 
 		default: return false;
 	}
 }
 
-void ShaderDecompiler::callFunction(const Function& function) { decompiledShader += function.getCallStatement() + ";\n"; }
+void ShaderDecompiler::callFunction(const Function& function) {
+	switch (function.exitMode) {
+		// This function always ends, so call it and return true to signal that we're gonna be ending the shader
+		case ExitMode::AlwaysEnd: decompiledShader += function.getCallStatement() + ";\nreturn true;\n"; break;
+		// This function will potentially end. Call it, see if it returns that it ended, and return that we're ending if it did
+		case ExitMode::Conditional: decompiledShader += fmt::format("if ({}) {{ return true; }}\n", function.getCallStatement()); break;
+		// This function will not end. Just call it like a normal function.
+		default: decompiledShader += function.getCallStatement() + ";\n"; break;
+	}
+}
 
 std::string ShaderGen::decompileShader(PICAShader& shader, EmulatorConfig& config, u32 entrypoint, API api, Language language) {
 	ShaderDecompiler decompiler(shader, config, entrypoint, api, language);
@@ -726,7 +783,7 @@ const char* ShaderDecompiler::getCondition(u32 cond, u32 refX, u32 refY) {
 		"cmp_reg.x",
 		"cmp_reg.y",
 	};
-	u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);
+	const u32 key = (cond & 0b11) | (refX << 2) | (refY << 3);
 
 	return conditions[key];
 }

From 943cf9b8890b8b822b3559cad3c8830acb6dec95 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 19 Aug 2024 23:46:37 +0300
Subject: [PATCH 31/63] Shader decompiler: Fix JMPU condition

---
 src/core/PICA/shader_decompiler.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index cab55fb01..73bd4eb01 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -591,8 +591,9 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const u32 dest = getBits<10, 12>(instruction);
 				const u32 bit = getBits<22, 4>(instruction);  // Bit of the bool uniform to check
 				const u32 mask = 1u << bit;
+				const u32 test = (instruction & 1) ^ 1;  // If the LSB is 0 we jump when the uniform bit is set, otherwise we jump when it is clear
 
-				decompiledShader += fmt::format("if ((uniform_bool & {}u) != 0u) {{ pc = {}u; break; }}\n", mask, dest);
+				decompiledShader += fmt::format("if ((uniform_bool & {}u) {} 0u) {{ pc = {}u; break; }}\n", mask, (test != 0) ? "!=" : "==", dest);
 				break;
 			}
 

From 652b6008845bef59539192fec713d270ab4cd86d Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 20 Aug 2024 15:10:55 +0300
Subject: [PATCH 32/63] Shader decompiler: Convert main function to void

---
 src/core/PICA/shader_decompiler.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 73bd4eb01..133637a7b 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -270,10 +270,11 @@ std::string ShaderDecompiler::decompile() {
 		decompiledShader += func.getForwardDecl();
 	}
 
-	decompiledShader += "bool pica_shader_main() {\n";
+	decompiledShader += "void pica_shader_main() {\n";
 	AddressRange mainFunctionRange(entrypoint, PICAShader::maxInstructionCount);
-	callFunction(*findFunction(mainFunctionRange));
-	decompiledShader += "return true;\n}\n";
+	auto mainFunc = findFunction(mainFunctionRange);
+
+	decompiledShader += mainFunc->getCallStatement() + ";\n}\n";
 
 	for (const Function& func : controlFlow.functions) {
 		if (func.outLabels.empty()) {

From e13ef42b654a8dd0e8122e6f78fb7713ca84e8c2 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 21 Aug 2024 00:47:57 +0300
Subject: [PATCH 33/63] PICA: Start implementing GPU vertex fetch

---
 CMakeLists.txt                       |  4 +-
 include/PICA/draw_acceleration.hpp   | 19 ++++++++
 include/PICA/gpu.hpp                 |  2 +
 include/renderer.hpp                 |  5 +-
 include/renderer_gl/renderer_gl.hpp  |  4 +-
 src/core/PICA/draw_acceleration.cpp  | 71 ++++++++++++++++++++++++++++
 src/core/PICA/gpu.cpp                | 10 +++-
 src/core/PICA/regs.cpp               |  2 +-
 src/core/renderer_gl/renderer_gl.cpp |  8 +++-
 9 files changed, 117 insertions(+), 8 deletions(-)
 create mode 100644 include/PICA/draw_acceleration.hpp
 create mode 100644 src/core/PICA/draw_acceleration.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 043bb084b..643e48e38 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -241,7 +241,7 @@ set(PICA_SOURCE_FILES src/core/PICA/gpu.cpp src/core/PICA/regs.cpp src/core/PICA
                       src/core/PICA/shader_interpreter.cpp src/core/PICA/dynapica/shader_rec.cpp
                       src/core/PICA/dynapica/shader_rec_emitter_x64.cpp src/core/PICA/pica_hash.cpp
                       src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp src/core/PICA/shader_gen_glsl.cpp
-                      src/core/PICA/shader_decompiler.cpp
+                      src/core/PICA/shader_decompiler.cpp src/core/PICA/draw_acceleration.cpp
 )
 
 set(LOADER_SOURCE_FILES src/core/loader/elf.cpp src/core/loader/ncsd.cpp src/core/loader/ncch.cpp src/core/loader/3dsx.cpp src/core/loader/lz77.cpp)
@@ -293,7 +293,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/miniaudio_device.hpp include/ring_buffer.hpp include/bitfield.hpp include/audio/dsp_shared_mem.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
-                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp
+                 include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp
 )
 
 cmrc_add_resource_library(
diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
new file mode 100644
index 000000000..eec76b873
--- /dev/null
+++ b/include/PICA/draw_acceleration.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <array>
+
+#include "helpers.hpp"
+
+namespace PICA {
+	struct DrawAcceleration {
+		u8* vertexBuffer;
+		u8* indexBuffer;
+
+		// Minimum and maximum index in the index buffer for a draw call
+		u16 minimumIndex, maximumIndex;
+		u32 vertexDataSize;
+
+		bool canBeAccelerated;
+		bool indexed;
+	};
+}  // namespace PICA
\ No newline at end of file
diff --git a/include/PICA/gpu.hpp b/include/PICA/gpu.hpp
index 1e1d3c4bd..c168a9bfe 100644
--- a/include/PICA/gpu.hpp
+++ b/include/PICA/gpu.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <array>
 
+#include "PICA/draw_acceleration.hpp"
 #include "PICA/dynapica/shader_rec.hpp"
 #include "PICA/float_types.hpp"
 #include "PICA/pica_vertex.hpp"
@@ -87,6 +88,7 @@ class GPU {
 	std::unique_ptr<Renderer> renderer;
 	PICA::Vertex getImmediateModeVertex();
 
+	void getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed);
   public:
 	// 256 entries per LUT with each LUT as its own row forming a 2D image 256 * LUT_COUNT
 	// Encoded in PICA native format
diff --git a/include/renderer.hpp b/include/renderer.hpp
index 721364c1a..94a0b0f3c 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -1,9 +1,10 @@
 #pragma once
 #include <array>
+#include <optional>
 #include <span>
 #include <string>
-#include <optional>
 
+#include "PICA/draw_acceleration.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "helpers.hpp"
@@ -83,7 +84,7 @@ class Renderer {
 	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
 	// ubershaders and shadergen, and so on.
 	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
-	virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) { return false; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { return false; }
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 73b52cc5a..397aaf534 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -135,6 +135,8 @@ class RendererGL final : public Renderer {
 	void updateFogLUT();
 	void initGraphicsContextInternal();
 
+	void accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel);
+
   public:
 	RendererGL(GPU& gpu, const std::array<u32, regNum>& internalRegs, const std::array<u32, extRegNum>& externalRegs)
 		: Renderer(gpu, internalRegs, externalRegs), fragShaderGen(PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL) {}
@@ -152,7 +154,7 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-	virtual bool prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) override;
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
new file mode 100644
index 000000000..4f3e5bdd7
--- /dev/null
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -0,0 +1,71 @@
+#include "PICA/draw_acceleration.hpp"
+
+#include <limits>
+
+#include "PICA/gpu.hpp"
+#include "PICA/regs.hpp"
+
+void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
+	accel.indexed = indexed;
+	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
+	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
+
+	accel.vertexBuffer = getPointerPhys<u8>(vertexBase);
+	if (indexed) {
+		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
+		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
+
+		u8* indexBuffer = getPointerPhys<u8>(indexBufferPointer);
+		u16 minimumIndex = std::numeric_limits<u16>::max();
+		u16 maximumIndex = 0;
+
+		// Check whether the index buffer uses u16 indices or u8
+		bool shortIndex = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
+
+		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
+		if (shortIndex) {
+			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+			for (int i = 0; i < vertexCount; i++) {
+				u16 index = indexBuffer16[i];
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		} else {
+			for (int i = 0; i < vertexCount; i++) {
+				u16 index = u16(indexBuffer[i]);
+				minimumIndex = std::min(minimumIndex, index);
+				maximumIndex = std::max(maximumIndex, index);
+			}
+		}
+
+		accel.indexBuffer = indexBuffer;
+		accel.minimumIndex = minimumIndex;
+		accel.maximumIndex = maximumIndex;
+	} else {
+		accel.indexBuffer = nullptr;
+		accel.minimumIndex = regs[PICA::InternalRegs::VertexOffsetReg];
+		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
+	}
+
+	int buffer = 0;
+	accel.vertexDataSize = 0;
+
+	for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) {
+		bool fixedAttribute = (fixedAttribMask & (1 << attrCount)) != 0;
+
+		if (!fixedAttribute) {
+			auto& attr = attributeInfo[buffer];  // Get information for this attribute
+			
+			if (attr.componentCount != 0) {
+				// Size of the attribute in bytes multiplied by the total number of vertices
+				const u32 bytes = attr.size * vertexCount;
+				// Add it to the total vertex data size, aligned to 4 bytes.
+				accel.vertexDataSize += (bytes + 3) & ~3;
+			}
+
+			buffer++;
+		}
+	}
+
+	accel.canBeAccelerated = true;
+}
\ No newline at end of file
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 6cbdb100b..7e9be0053 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -123,7 +123,15 @@ void GPU::reset() {
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
-	const bool hwShaders = renderer->prepareForDraw(shaderUnit, false);
+	PICA::DrawAcceleration accel;
+
+	if (config.accelerateShaders) {
+		// If we are potentially going to use hw shaders, gather the information necessary to do vertex fetch, index buffering, etc on the GPU
+		// This includes parsing which vertices to upload, getting pointers to the index buffer data & vertex data, and so on 
+		getAcceleratedDrawInfo(accel, indexed);
+	}
+
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel, false);
 
 	if (hwShaders) {
 		if (indexed) {
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index 0c5f4adb7..091bd377d 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -249,7 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
-							renderer->prepareForDraw(shaderUnit, true);
+							renderer->prepareForDraw(shaderUnit, nullptr, true);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
 
 							switch (primType) {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 6e50f77be..d0ecf4433 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -942,7 +942,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }
 
-bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) {
 	// First we figure out if we will be using an ubershader
 	bool usingUbershader = emulatorConfig->useUbershaders;
 	if (usingUbershader) {
@@ -993,6 +993,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, bool isImmediateMode) {
 				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
 			}
 		}
+
+		accelerateVertexUpload(shaderUnit, accel);
 	}
 
 	if (usingUbershader) {
@@ -1110,4 +1112,8 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex1"), 1);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex2"), 2);
 	glUniform1i(OpenGL::uniformLocation(program, "u_tex_luts"), 3);
+}
+
+void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+
 }
\ No newline at end of file

From 74a341ba46667696d87160fa979831a53d4f5a73 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 24 Aug 2024 02:58:05 +0300
Subject: [PATCH 34/63] More hw VAO work

---
 include/PICA/draw_acceleration.hpp   | 15 +++++++++
 src/core/PICA/draw_acceleration.cpp  | 47 ++++++++++++++++++++++++----
 src/core/renderer_gl/renderer_gl.cpp | 10 ++++++
 3 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index eec76b873..f940fc7c0 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -6,13 +6,28 @@
 
 namespace PICA {
 	struct DrawAcceleration {
+		static constexpr u32 maxAttribCount = 12;
+
+		struct AttributeInfo {
+			u32 offset;
+
+			u8 type;
+			u8 componentCount;
+			bool fixed;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
 		u8* vertexBuffer;
 		u8* indexBuffer;
 
 		// Minimum and maximum index in the index buffer for a draw call
 		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;
 		u32 vertexDataSize;
 
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+
 		bool canBeAccelerated;
 		bool indexed;
 	};
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 4f3e5bdd7..827f107db 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -7,6 +7,8 @@
 
 void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	accel.indexed = indexed;
+	accel.totalAttribCount = totalAttribCount;
+
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
 	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
@@ -47,23 +49,56 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
 	}
 
+	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
 	int buffer = 0;
 	accel.vertexDataSize = 0;
 
 	for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) {
-		bool fixedAttribute = (fixedAttribMask & (1 << attrCount)) != 0;
+		auto& attr = accel.attributeInfo[attrCount];
+		attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0;
 
-		if (!fixedAttribute) {
-			auto& attr = attributeInfo[buffer];  // Get information for this attribute
-			
-			if (attr.componentCount != 0) {
+		// Variable (non-fixed) attribute
+		if (!attr.fixed) {
+			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
+			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
+			u32 attributeOffset = attrData.offset;
+
+			if (attrData.componentCount != 0) {
 				// Size of the attribute in bytes multiplied by the total number of vertices
-				const u32 bytes = attr.size * vertexCount;
+				const u32 bytes = attrData.size * vertexCount;
 				// Add it to the total vertex data size, aligned to 4 bytes.
 				accel.vertexDataSize += (bytes + 3) & ~3;
 			}
 
+			for (int i = 0; i < attrData.componentCount; i++) {
+				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+				// Vertex attributes used as padding
+				// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+				if (index >= 12) [[unlikely]] {
+					Helpers::panic("Padding attribute");
+					// Align attribute address up to a 4 byte boundary
+					attributeOffset = (attributeOffset + 3) & -4;
+					attributeOffset += (index - 11) << 2;
+					continue;
+				}
+
+				u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
+				u32 attribType = attribInfo & 0x3;  //  Type of attribute(sbyte/ubyte/short/float)
+				u32 size = (attribInfo >> 2) + 1;   // Total number of components
+			
+				attr.componentCount = size;
+				attr.offset = attributeOffset;
+				attr.type = attribType;
+			}
+
 			buffer++;
+		} else {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];
+
+			for (int i = 0; i < 4; i++) {
+				attr.fixedValue[i] = fixedAttr[i].toFloat32();
+			}
 		}
 	}
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index d0ecf4433..71346f9b5 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -1115,5 +1115,15 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 }
 
 void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	u32 buffer = 0;  // Vertex buffer index for non-fixed attributes
+	u32 attrCount = 0;
 
+	const u32 totalAttribCount = accel->totalAttribCount;
+
+	static constexpr GLenum attributeFormats[4] = {
+		GL_BYTE,           // 0: Signed byte
+		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
+		GL_SHORT,          // 2: Short
+		GL_FLOAT,          // 3: Float
+	};
 }
\ No newline at end of file

From 5d6f59112aa677084851734e1959d99d1c8d5283 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 24 Aug 2024 02:58:05 +0300
Subject: [PATCH 35/63] More hw VAO work

---
 include/PICA/draw_acceleration.hpp   | 15 ++++++++
 src/core/PICA/draw_acceleration.cpp  | 51 ++++++++++++++++++++++++----
 src/core/renderer_gl/renderer_gl.cpp | 10 ++++++
 3 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index eec76b873..f940fc7c0 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -6,13 +6,28 @@
 
 namespace PICA {
 	struct DrawAcceleration {
+		static constexpr u32 maxAttribCount = 12;
+
+		struct AttributeInfo {
+			u32 offset;
+
+			u8 type;
+			u8 componentCount;
+			bool fixed;
+
+			std::array<float, 4> fixedValue;  // For fixed attributes
+		};
+
 		u8* vertexBuffer;
 		u8* indexBuffer;
 
 		// Minimum and maximum index in the index buffer for a draw call
 		u16 minimumIndex, maximumIndex;
+		u32 totalAttribCount;
 		u32 vertexDataSize;
 
+		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+
 		bool canBeAccelerated;
 		bool indexed;
 	};
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 4f3e5bdd7..b96f6db42 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -7,6 +7,8 @@
 
 void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	accel.indexed = indexed;
+	accel.totalAttribCount = totalAttribCount;
+
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
 	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
@@ -47,23 +49,60 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 		accel.maximumIndex = accel.minimumIndex + vertexCount - 1;
 	}
 
+	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
 	int buffer = 0;
 	accel.vertexDataSize = 0;
 
 	for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) {
-		bool fixedAttribute = (fixedAttribMask & (1 << attrCount)) != 0;
+		auto& attr = accel.attributeInfo[attrCount];
+		attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0;
 
-		if (!fixedAttribute) {
-			auto& attr = attributeInfo[buffer];  // Get information for this attribute
-			
-			if (attr.componentCount != 0) {
+		// Variable attribute attribute
+		if (!attr.fixed) {
+			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
+			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
+			u32 attributeOffset = attrData.offset;
+
+			if (attrData.componentCount != 0) {
 				// Size of the attribute in bytes multiplied by the total number of vertices
-				const u32 bytes = attr.size * vertexCount;
+				const u32 bytes = attrData.size * vertexCount;
 				// Add it to the total vertex data size, aligned to 4 bytes.
 				accel.vertexDataSize += (bytes + 3) & ~3;
 			}
 
+			for (int i = 0; i < attrData.componentCount; i++) {
+				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+				// Vertex attributes used as padding
+				// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+				if (index >= 12) [[unlikely]] {
+					Helpers::panic("Padding attribute");
+					// Align attribute address up to a 4 byte boundary
+					attributeOffset = (attributeOffset + 3) & -4;
+					attributeOffset += (index - 11) << 2;
+					continue;
+				}
+
+				u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
+				u32 attribType = attribInfo & 0x3;  //  Type of attribute(sbyte/ubyte/short/float)
+				u32 size = (attribInfo >> 2) + 1;   // Total number of components
+			
+				attr.componentCount = size;
+				attr.offset = attributeOffset;
+				attr.type = attribType;
+
+				// Size of each component based on the attribute type
+				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+				attributeOffset += size * sizePerComponent[attribType];
+			}
+
 			buffer++;
+		} else {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];
+
+			for (int i = 0; i < 4; i++) {
+				attr.fixedValue[i] = fixedAttr[i].toFloat32();
+			}
 		}
 	}
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index d0ecf4433..71346f9b5 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -1115,5 +1115,15 @@ void RendererGL::initUbershader(OpenGL::Program& program) {
 }
 
 void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
+	u32 buffer = 0;  // Vertex buffer index for non-fixed attributes
+	u32 attrCount = 0;
 
+	const u32 totalAttribCount = accel->totalAttribCount;
+
+	static constexpr GLenum attributeFormats[4] = {
+		GL_BYTE,           // 0: Signed byte
+		GL_UNSIGNED_BYTE,  // 1: Unsigned byte
+		GL_SHORT,          // 2: Short
+		GL_FLOAT,          // 3: Float
+	};
 }
\ No newline at end of file

From a8b30ee2dc5b53f6bd7f62953189d767c01f7186 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 00:45:23 +0300
Subject: [PATCH 36/63] More GPU vertex fetch code

---
 include/PICA/draw_acceleration.hpp   |  1 +
 src/core/PICA/draw_acceleration.cpp  | 33 ++++++++++++++++++++--------
 src/core/renderer_gl/renderer_gl.cpp |  8 +++++++
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index f940fc7c0..bd3e428dd 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -9,6 +9,7 @@ namespace PICA {
 		static constexpr u32 maxAttribCount = 12;
 
 		struct AttributeInfo {
+			u8* data;
 			u32 offset;
 
 			u8 type;
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index b96f6db42..e9546cf7b 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -50,15 +50,15 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	}
 
 	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
-	int buffer = 0;
+	u32 buffer = 0;
+	u32 attrCount = 0;
 	accel.vertexDataSize = 0;
 
-	for (int attrCount = 0; attrCount < totalAttribCount; attrCount++) {
-		auto& attr = accel.attributeInfo[attrCount];
-		attr.fixed = (fixedAttribMask & (1 << attrCount)) != 0;
+	while (attrCount < totalAttribCount) {
+		bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0;
 
 		// Variable attribute attribute
-		if (!attr.fixed) {
+		if (!fixedAttrib) {
 			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
 			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
 			u32 attributeOffset = attrData.offset;
@@ -72,6 +72,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 
 			for (int i = 0; i < attrData.componentCount; i++) {
 				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
+				auto& attr = accel.attributeInfo[attrCount];
+				attr.fixed = false;
 
 				// Vertex attributes used as padding
 				// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
@@ -83,26 +85,39 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 					continue;
 				}
 
-				u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
-				u32 attribType = attribInfo & 0x3;  //  Type of attribute(sbyte/ubyte/short/float)
-				u32 size = (attribInfo >> 2) + 1;   // Total number of components
+				const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
+				const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+				const u32 size = (attribInfo >> 2) + 1;   // Total number of components
 			
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
 				attr.type = attribType;
 
+				// Get a pointer to the data where this attribute is stored
+				const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size);
+				attr.data = getPointerPhys<u8>(attrAddress);
+
 				// Size of each component based on the attribute type
 				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
 				attributeOffset += size * sizePerComponent[attribType];
+
+				attrCount += 1;
 			}
 
-			buffer++;
+			buffer += 1;
 		} else {
 			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];
+			auto& attr = accel.attributeInfo[attrCount];
+
+			attr.fixed = true;
+			// Set the data pointer to nullptr in order to catch any potential bugs
+			attr.data = nullptr;
 
 			for (int i = 0; i < 4; i++) {
 				attr.fixedValue[i] = fixedAttr[i].toFloat32();
 			}
+
+			attrCount += 1;
 		}
 	}
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 71346f9b5..f5728346b 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -1126,4 +1126,12 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 		GL_SHORT,          // 2: Short
 		GL_FLOAT,          // 3: Float
 	};
+
+	for (int i = 0; i < totalAttribCount; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		printf(
+			"%s attribute starting from offset %d with a size of %d components\n", attrib.fixed ? "Fixed" : "Variable", (!attrib.fixed) ? attrib.offset : 0,
+			!attrib.fixed ? attrib.componentCount : 4
+		);
+	}
 }
\ No newline at end of file

From e34bdb68413a8a0560a3708813949ddd4ba175c8 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 01:47:02 +0300
Subject: [PATCH 37/63] Add GL Stream Buffer from Duckstation

---
 CMakeLists.txt                               |   5 +-
 include/align.hpp                            |  99 +++++++
 include/renderer_gl/renderer_gl.hpp          |   8 +-
 src/core/PICA/draw_acceleration.cpp          |   2 +
 src/core/renderer_gl/renderer_gl.cpp         |   8 +
 third_party/duckstation/gl/stream_buffer.cpp | 288 +++++++++++++++++++
 third_party/duckstation/gl/stream_buffer.h   |  53 ++++
 7 files changed, 461 insertions(+), 2 deletions(-)
 create mode 100644 include/align.hpp
 create mode 100644 third_party/duckstation/gl/stream_buffer.cpp
 create mode 100644 third_party/duckstation/gl/stream_buffer.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89322af45..6a94047c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -138,6 +138,7 @@ include_directories(${SDL2_INCLUDE_DIR})
 include_directories(third_party/toml11)
 include_directories(third_party/glm)
 include_directories(third_party/renderdoc)
+include_directories(third_party/duckstation)
 
 add_subdirectory(third_party/cmrc)
 
@@ -302,6 +303,7 @@ set(HEADER_FILES include/emulator.hpp include/helpers.hpp include/termcolor.hpp
                  include/audio/hle_core.hpp include/capstone.hpp include/audio/aac.hpp include/PICA/pica_frag_config.hpp
                  include/PICA/pica_frag_uniforms.hpp include/PICA/shader_gen_types.hpp include/PICA/shader_decompiler.hpp
                  include/PICA/pica_vert_config.hpp include/sdl_sensors.hpp include/PICA/draw_acceleration.hpp include/renderdoc.hpp
+                 include/align.hpp
 )
 
 cmrc_add_resource_library(
@@ -334,7 +336,6 @@ if(ENABLE_LUAJIT AND NOT ANDROID)
 endif()
 
 if(ENABLE_QT_GUI)
-    include_directories(third_party/duckstation)
     set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/window_info.cpp third_party/duckstation/gl/context.cpp)
 
     if(APPLE)
@@ -377,6 +378,8 @@ if(ENABLE_OPENGL)
         src/host_shaders/opengl_fragment_shader.frag
     )
 
+    set(THIRD_PARTY_SOURCE_FILES ${THIRD_PARTY_SOURCE_FILES} third_party/duckstation/gl/stream_buffer.cpp)
+
     set(HEADER_FILES ${HEADER_FILES} ${RENDERER_GL_INCLUDE_FILES})
     source_group("Source Files\\Core\\OpenGL Renderer" FILES ${RENDERER_GL_SOURCE_FILES})
 
diff --git a/include/align.hpp b/include/align.hpp
new file mode 100644
index 000000000..6b79a6564
--- /dev/null
+++ b/include/align.hpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2019-2022 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <cstdlib>
+
+#include "helpers.hpp"
+
+#ifdef _MSC_VER
+#include <malloc.h>
+#endif
+
+namespace Common {
+	template <typename T>
+	constexpr bool isAligned(T value, unsigned int alignment) {
+		return (value % static_cast<T>(alignment)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUp(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+
+	template <typename T>
+	constexpr T alignDown(T value, unsigned int alignment) {
+		return value / static_cast<T>(alignment) * static_cast<T>(alignment);
+	}
+    
+	template <typename T>
+	constexpr bool isAlignedPow2(T value, unsigned int alignment) {
+		return (value & static_cast<T>(alignment - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T alignUpPow2(T value, unsigned int alignment) {
+		return (value + static_cast<T>(alignment - 1)) & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr T alignDownPow2(T value, unsigned int alignment) {
+		return value & static_cast<T>(~static_cast<T>(alignment - 1));
+	}
+
+	template <typename T>
+	constexpr bool isPow2(T value) {
+		return (value & (value - 1)) == 0;
+	}
+
+	template <typename T>
+	constexpr T previousPow2(T value) {  // Largest power of two <= value (unsigned values expected); returns 0 for 0
+		if (value == static_cast<T>(0)) return 0;
+
+		value |= (value >> 1);  // Smear the highest set bit into every lower bit position
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);   // sizeof() is in bytes, not bits: needed for 16-bit and wider types
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);  // Needed for 32-bit and wider types
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);  // Needed for 64-bit types
+		return value - (value >> 1);  // All bits below the top one are set; subtracting the lower half leaves only the top bit
+	}
+    
+	template <typename T>
+	constexpr T nextPow2(T value) {  // Smallest power of two >= value (unsigned values expected); returns 0 for 0
+		// https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+		if (value == static_cast<T>(0)) return 0;
+
+		value--;  // So that exact powers of two map to themselves
+		value |= (value >> 1);
+		value |= (value >> 2);
+		value |= (value >> 4);
+		if constexpr (sizeof(T) >= 2) value |= (value >> 8);   // sizeof() is in bytes, not bits: needed for 16-bit and wider types
+		if constexpr (sizeof(T) >= 4) value |= (value >> 16);  // Needed for 32-bit and wider types
+		if constexpr (sizeof(T) >= 8) value |= (value >> 32);  // Needed for 64-bit types
+		value++;
+		return value;
+	}
+
+	ALWAYS_INLINE static void* alignedMalloc(size_t size, size_t alignment) {
+#ifdef _MSC_VER
+		return _aligned_malloc(size, alignment);
+#else
+		// Unaligned sizes are slow on macOS.
+#ifdef __APPLE__
+		if (isPow2(alignment)) size = (size + alignment - 1) & ~(alignment - 1);
+#endif
+		void* ret = nullptr;
+		return (posix_memalign(&ret, alignment, size) == 0) ? ret : nullptr;
+#endif
+	}
+
+	ALWAYS_INLINE static void alignedFree(void* ptr) {
+#ifdef _MSC_VER
+		_aligned_free(ptr);
+#else
+		free(ptr);
+#endif
+	}
+}  // namespace Common
\ No newline at end of file
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 397aaf534..63bbb474c 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -3,6 +3,7 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <memory>
 #include <optional>
 #include <span>
 #include <unordered_map>
@@ -10,11 +11,12 @@
 
 #include "PICA/float_types.hpp"
 #include "PICA/pica_frag_config.hpp"
-#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_hash.hpp"
+#include "PICA/pica_vert_config.hpp"
 #include "PICA/pica_vertex.hpp"
 #include "PICA/regs.hpp"
 #include "PICA/shader_gen.hpp"
+#include "gl/stream_buffer.h"
 #include "gl_state.hpp"
 #include "helpers.hpp"
 #include "logger.hpp"
@@ -83,6 +85,10 @@ class RendererGL final : public Renderer {
 	// UBO for uploading the PICA uniforms when using hw shaders
 	GLuint hwShaderUniformUBO;
 
+	using StreamBuffer = OpenGLStreamBuffer;
+	std::unique_ptr<StreamBuffer> hwVertexBuffer;
+	std::unique_ptr<StreamBuffer> hwIndexBuffer;
+
 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index e9546cf7b..5fc21e48a 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -82,6 +82,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 					// Align attribute address up to a 4 byte boundary
 					attributeOffset = (attributeOffset + 3) & -4;
 					attributeOffset += (index - 11) << 2;
+
+					attr.data = nullptr;
 					continue;
 				}
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index f5728346b..3b2d1d70b 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -78,6 +78,14 @@ void RendererGL::initGraphicsContextInternal() {
 	gl.useProgram(displayProgram);
 	glUniform1i(OpenGL::uniformLocation(displayProgram, "u_texture"), 0);  // Init sampler object
 
+	// Create stream buffers for vertex, index and uniform buffers
+	// TODO: Remove buffers from GL state tracking as the StreamBuffer implementation bypasses the state tracker.
+	static constexpr usize hwIndexBufferSize = 2_MB;
+	static constexpr usize hwVertexBufferSize = 16_MB;
+
+	hwIndexBuffer = StreamBuffer::Create(GL_ELEMENT_ARRAY_BUFFER, hwIndexBufferSize);
+	hwVertexBuffer = StreamBuffer::Create(GL_ARRAY_BUFFER, hwVertexBufferSize);
+
 	// Allocate memory for the shadergen fragment uniform UBO
 	glGenBuffers(1, &shadergenFragmentUBO);
 	gl.bindUBO(shadergenFragmentUBO);
diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
new file mode 100644
index 000000000..f4f8b54cf
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -0,0 +1,288 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#include "gl/stream_buffer.h"
+
+#include <array>
+#include <cstdio>
+
+#include "align.hpp"
+
+OpenGLStreamBuffer::OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : m_target(target), m_buffer_id(buffer_id), m_size(size) {}
+OpenGLStreamBuffer::~OpenGLStreamBuffer() { glDeleteBuffers(1, &m_buffer_id); }
+
+void OpenGLStreamBuffer::Bind() { glBindBuffer(m_target, m_buffer_id); }
+void OpenGLStreamBuffer::Unbind() { glBindBuffer(m_target, 0); }
+
+void OpenGLStreamBuffer::SetDebugName(std::string_view name) {
+#ifdef GPU_DEBUG_INFO
+	if (glObjectLabel) {
+		glObjectLabel(GL_BUFFER, GetGLBufferId(), static_cast<GLsizei>(name.length()), static_cast<const GLchar*>(name.data()));
+	}
+#endif
+}
+
+namespace {
+	// Uses glBufferSubData() to update. Preferred for drivers which don't support {ARB,EXT}_buffer_storage.
+	class BufferSubDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferSubDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
+
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferSubData(m_target, 0, used_size, m_cpu_buffer);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferSubDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferSubDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Uses BufferData() to orphan the buffer after every update. Used on Mali where BufferSubData forces a sync.
+	class BufferDataStreamBuffer final : public OpenGLStreamBuffer {
+	  public:
+		~BufferDataStreamBuffer() override { Common::alignedFree(m_cpu_buffer); }
+
+		MappingResult Map(u32 alignment, u32 min_size) override { return MappingResult{static_cast<void*>(m_cpu_buffer), 0, 0, m_size / alignment}; }
+
+		u32 Unmap(u32 used_size) override {
+			if (used_size == 0) return 0;
+
+			glBindBuffer(m_target, m_buffer_id);
+			glBufferData(m_target, used_size, m_cpu_buffer, GL_STREAM_DRAW);
+			return 0;
+		}
+
+		u32 GetChunkSize() const override { return m_size; }
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+			glBufferData(target, size, nullptr, GL_STREAM_DRAW);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferDataStreamBuffer(target, buffer_id, size));
+		}
+
+	  private:
+		BufferDataStreamBuffer(GLenum target, GLuint buffer_id, u32 size) : OpenGLStreamBuffer(target, buffer_id, size) {
+			m_cpu_buffer = static_cast<u8*>(Common::alignedMalloc(size, 32));
+			if (!m_cpu_buffer) Panic("Failed to allocate CPU storage for GL buffer");
+		}
+
+		u8* m_cpu_buffer;
+	};
+
+	// Base class for implementations which require syncing.
+	class SyncingStreamBuffer : public OpenGLStreamBuffer {
+	  public:
+		enum : u32 { NUM_SYNC_POINTS = 16 };
+
+		virtual ~SyncingStreamBuffer() override {
+			for (u32 i = m_available_block_index; i <= m_used_block_index; i++) {
+				glDeleteSync(m_sync_objects[i]);
+			}
+		}
+
+	  protected:
+		SyncingStreamBuffer(GLenum target, GLuint buffer_id, u32 size)
+			: OpenGLStreamBuffer(target, buffer_id, size), m_bytes_per_block((size + (NUM_SYNC_POINTS)-1) / NUM_SYNC_POINTS) {}
+
+		ALWAYS_INLINE u32 GetSyncIndexForOffset(u32 offset) { return offset / m_bytes_per_block; }
+
+		ALWAYS_INLINE void AddSyncsForOffset(u32 offset) {
+			const u32 end = GetSyncIndexForOffset(offset);
+			for (; m_used_block_index < end; m_used_block_index++) {
+				if (m_sync_objects[m_used_block_index]) {
+					Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+				}
+
+				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
+			}
+		}
+
+		ALWAYS_INLINE void WaitForSync(GLsync& sync) {
+			glClientWaitSync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
+			glDeleteSync(sync);
+			sync = nullptr;
+		}
+
+		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {  // Waits on (and frees) every fence guarding blocks up to and including offset's block
+			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
+			for (; m_available_block_index < end; m_available_block_index++) {
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {  // Validate the slot we are about to wait on, not the write cursor's slot
+					Helpers::panic("GL stream buffer: Fence slot we're trying to wait on is not in use");
+				}
+
+				WaitForSync(m_sync_objects[m_available_block_index]);
+			}
+		}
+
+		void AllocateSpace(u32 size) {
+			// add sync objects for writes since the last allocation
+			AddSyncsForOffset(m_position);
+
+			// wait for sync objects for the space we want to use
+			EnsureSyncsWaitedForOffset(m_position + size);
+
+			// wrap-around?
+			if ((m_position + size) > m_size) {
+				// current position ... buffer end
+				AddSyncsForOffset(m_size);
+
+				// rewind, and try again
+				m_position = 0;
+
+				// wait for the sync at the start of the buffer
+				WaitForSync(m_sync_objects[0]);
+				m_available_block_index = 1;
+
+				// and however much more we need to satisfy the allocation
+				EnsureSyncsWaitedForOffset(size);
+				m_used_block_index = 0;
+			}
+		}
+
+		u32 GetChunkSize() const override { return m_size / NUM_SYNC_POINTS; }
+
+		u32 m_position = 0;
+		u32 m_used_block_index = 0;
+		u32 m_available_block_index = NUM_SYNC_POINTS;
+		u32 m_bytes_per_block;
+		std::array<GLsync, NUM_SYNC_POINTS> m_sync_objects{};
+	};
+
+	class BufferStorageStreamBuffer : public SyncingStreamBuffer {
+	  public:
+		~BufferStorageStreamBuffer() override {
+			glBindBuffer(m_target, m_buffer_id);
+			glUnmapBuffer(m_target);
+			glBindBuffer(m_target, 0);
+		}
+
+		MappingResult Map(u32 alignment, u32 min_size) override {  // Reserves at least min_size bytes at an aligned position in the persistent mapping
+			if (m_position > 0) m_position = Common::alignUp(m_position, alignment);
+
+			AllocateSpace(min_size);  // May wait on fences and/or rewind m_position to 0
+			if ((m_position + min_size) > (m_available_block_index * m_bytes_per_block)) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Map");
+			}
+
+			const u32 free_space_in_block = ((m_available_block_index * m_bytes_per_block) - m_position);  // Bytes writable before the next un-waited block
+			return MappingResult{static_cast<void*>(m_mapped_ptr + m_position), m_position, m_position / alignment, free_space_in_block / alignment};
+		}
+
+		u32 Unmap(u32 used_size) override {
+			if ((m_position + used_size) > m_size) [[unlikely]] {
+				Helpers::panic("GL stream buffer: Invalid size passed to Unmap");
+			}
+
+			if (!m_coherent) {
+				if (GLAD_GL_VERSION_4_5 || GLAD_GL_ARB_direct_state_access) {
+					glFlushMappedNamedBufferRange(m_buffer_id, m_position, used_size);
+				} else {
+					Bind();
+					glFlushMappedBufferRange(m_target, m_position, used_size);
+				}
+			}
+
+			const u32 prev_position = m_position;
+			m_position += used_size;
+			return prev_position;
+		}
+
+		static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size, bool coherent = true) {
+			glGetError();
+
+			GLuint buffer_id;
+			glGenBuffers(1, &buffer_id);
+			glBindBuffer(target, buffer_id);
+
+			const u32 flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
+			const u32 map_flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT);
+			if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage)
+				glBufferStorage(target, size, nullptr, flags);
+			else if (GLAD_GL_EXT_buffer_storage)
+				glBufferStorageEXT(target, size, nullptr, flags);
+
+			GLenum err = glGetError();
+			if (err != GL_NO_ERROR) {
+				glBindBuffer(target, 0);
+				glDeleteBuffers(1, &buffer_id);
+				return {};
+			}
+
+			u8* mapped_ptr = static_cast<u8*>(glMapBufferRange(target, 0, size, map_flags));
+			AssertMsg(mapped_ptr, "Persistent buffer was mapped");
+
+			return std::unique_ptr<OpenGLStreamBuffer>(new BufferStorageStreamBuffer(target, buffer_id, size, mapped_ptr, coherent));
+		}
+
+	  private:
+		BufferStorageStreamBuffer(GLenum target, GLuint buffer_id, u32 size, u8* mapped_ptr, bool coherent)
+			: SyncingStreamBuffer(target, buffer_id, size), m_mapped_ptr(mapped_ptr), m_coherent(coherent) {}
+
+		u8* m_mapped_ptr;
+		bool m_coherent;
+	};
+
+}  // namespace
+
+std::unique_ptr<OpenGLStreamBuffer> OpenGLStreamBuffer::Create(GLenum target, u32 size) {
+	std::unique_ptr<OpenGLStreamBuffer> buf;
+	if (GLAD_GL_VERSION_4_4 || GLAD_GL_ARB_buffer_storage || GLAD_GL_EXT_buffer_storage) {
+		buf = BufferStorageStreamBuffer::Create(target, size);
+		if (buf) return buf;
+	}
+
+	// BufferSubData is slower on all drivers except NVIDIA...
+#if 0
+	const char* vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
+	if (std::strcmp(vendor, "ARM") == 0 || std::strcmp(vendor, "Qualcomm") == 0) {
+		// Mali and Adreno drivers can't do sub-buffer tracking...
+		return BufferDataStreamBuffer::Create(target, size);
+	}
+
+	return BufferSubDataStreamBuffer::Create(target, size);
+#else
+	return BufferDataStreamBuffer::Create(target, size);
+#endif
+}
\ No newline at end of file
diff --git a/third_party/duckstation/gl/stream_buffer.h b/third_party/duckstation/gl/stream_buffer.h
new file mode 100644
index 000000000..6b3562e78
--- /dev/null
+++ b/third_party/duckstation/gl/stream_buffer.h
@@ -0,0 +1,53 @@
+// SPDX-FileCopyrightText: 2019-2023 Connor McLaughlin <stenzek@gmail.com>
+// SPDX-License-Identifier: (GPL-3.0 OR CC-BY-NC-ND-4.0)
+
+#pragma once
+
+#include <glad/gl.h>
+// Comment to avoid clang-format reordering the glad header
+
+#include <memory>
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+#include "duckstation_compat.h"
+#include "helpers.hpp"
+
+class OpenGLStreamBuffer {
+  public:
+	virtual ~OpenGLStreamBuffer();
+
+	ALWAYS_INLINE GLuint GetGLBufferId() const { return m_buffer_id; }
+	ALWAYS_INLINE GLenum GetGLTarget() const { return m_target; }
+	ALWAYS_INLINE u32 GetSize() const { return m_size; }
+
+	void Bind();
+	void Unbind();
+
+	void SetDebugName(std::string_view name);
+
+	struct MappingResult {
+		void* pointer;
+		u32 buffer_offset;
+		u32 index_aligned;  // offset / alignment, suitable for base vertex
+		u32 space_aligned;  // remaining space / alignment
+	};
+
+	virtual MappingResult Map(u32 alignment, u32 min_size) = 0;
+
+	/// Returns the position in the buffer *before* the start of used_size.
+	virtual u32 Unmap(u32 used_size) = 0;
+
+	/// Returns the minimum granularity of blocks which sync objects will be created around.
+	virtual u32 GetChunkSize() const = 0;
+
+	static std::unique_ptr<OpenGLStreamBuffer> Create(GLenum target, u32 size);
+
+  protected:
+	OpenGLStreamBuffer(GLenum target, GLuint buffer_id, u32 size);
+
+	GLenum m_target;
+	GLuint m_buffer_id;
+	u32 m_size;
+};
\ No newline at end of file

From f96b609123cda765397c6015bd0450603e6d37a1 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 03:49:07 +0300
Subject: [PATCH 38/63] GL: Actually upload data to stream buffers

---
 include/PICA/draw_acceleration.hpp   |  4 ++-
 include/renderer_gl/gl_state.hpp     |  9 -------
 src/core/PICA/draw_acceleration.cpp  | 19 ++++++++------
 src/core/renderer_gl/gl_state.cpp    |  3 ---
 src/core/renderer_gl/renderer_gl.cpp | 38 +++++++++++++++++++++++-----
 5 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index bd3e428dd..2ec3f318a 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -11,15 +11,16 @@ namespace PICA {
 		struct AttributeInfo {
 			u8* data;
 			u32 offset;
+			u32 size;
 
 			u8 type;
 			u8 componentCount;
 			bool fixed;
+			bool isPadding;
 
 			std::array<float, 4> fixedValue;  // For fixed attributes
 		};
 
-		u8* vertexBuffer;
 		u8* indexBuffer;
 
 		// Minimum and maximum index in the index buffer for a draw call
@@ -31,5 +32,6 @@ namespace PICA {
 
 		bool canBeAccelerated;
 		bool indexed;
+		bool useShortIndices;
 	};
 }  // namespace PICA
\ No newline at end of file
diff --git a/include/renderer_gl/gl_state.hpp b/include/renderer_gl/gl_state.hpp
index e5591ea0f..4085cabcf 100644
--- a/include/renderer_gl/gl_state.hpp
+++ b/include/renderer_gl/gl_state.hpp
@@ -38,7 +38,6 @@ struct GLStateManager {
 	
 	GLuint stencilMask;
 	GLuint boundVAO;
-	GLuint boundVBO;
 	GLuint currentProgram;
 	GLuint boundUBO;
 
@@ -173,13 +172,6 @@ struct GLStateManager {
 		}
 	}
 
-	void bindVBO(GLuint handle) {
-		if (boundVBO != handle) {
-			boundVBO = handle;
-			glBindBuffer(GL_ARRAY_BUFFER, handle);
-		}
-	}
-
 	void useProgram(GLuint handle) {
 		if (currentProgram != handle) {
 			currentProgram = handle;
@@ -195,7 +187,6 @@ struct GLStateManager {
 	}
 
 	void bindVAO(const OpenGL::VertexArray& vao) { bindVAO(vao.handle()); }
-	void bindVBO(const OpenGL::VertexBuffer& vbo) { bindVBO(vbo.handle()); }
 	void useProgram(const OpenGL::Program& program) { useProgram(program.handle()); }
 
 	void setColourMask(bool r, bool g, bool b, bool a) {
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 5fc21e48a..22b1f0413 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -12,7 +12,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
 	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
-	accel.vertexBuffer = getPointerPhys<u8>(vertexBase);
 	if (indexed) {
 		u32 indexBufferConfig = regs[PICA::InternalRegs::IndexBufferConfig];
 		u32 indexBufferPointer = vertexBase + (indexBufferConfig & 0xfffffff);
@@ -22,11 +21,12 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 		u16 maximumIndex = 0;
 
 		// Check whether the index buffer uses u16 indices or u8
-		bool shortIndex = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
+		accel.useShortIndices = Helpers::getBit<31>(indexBufferConfig);  // Indicates whether vert indices are 16-bit or 8-bit
 
 		// Calculate the minimum and maximum indices used in the index buffer, so we'll only upload them
-		if (shortIndex) {
+		if (accel.useShortIndices) {
 			u16* indexBuffer16 = reinterpret_cast<u16*>(indexBuffer);
+
 			for (int i = 0; i < vertexCount; i++) {
 				u16 index = indexBuffer16[i];
 				minimumIndex = std::min(minimumIndex, index);
@@ -84,6 +84,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 					attributeOffset += (index - 11) << 2;
 
 					attr.data = nullptr;
+					attr.isPadding = true;
 					continue;
 				}
 
@@ -91,18 +92,19 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
 				const u32 size = (attribInfo >> 2) + 1;   // Total number of components
 			
+				// Size of each component based on the attribute type
+				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
+				attr.size = size * sizePerComponent[attribType];
 				attr.type = attribType;
+				attr.isPadding = false;
+				attributeOffset += attr.size;
 
 				// Get a pointer to the data where this attribute is stored
 				const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size);
 				attr.data = getPointerPhys<u8>(attrAddress);
-
-				// Size of each component based on the attribute type
-				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
-				attributeOffset += size * sizePerComponent[attribType];
-
 				attrCount += 1;
 			}
 
@@ -114,6 +116,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			attr.fixed = true;
 			// Set the data pointer to nullptr in order to catch any potential bugs
 			attr.data = nullptr;
+			attr.isPadding = false;
 
 			for (int i = 0; i < 4; i++) {
 				attr.fixedValue[i] = fixedAttr[i].toFloat32();
diff --git a/src/core/renderer_gl/gl_state.cpp b/src/core/renderer_gl/gl_state.cpp
index 3d1c0681a..785cac411 100644
--- a/src/core/renderer_gl/gl_state.cpp
+++ b/src/core/renderer_gl/gl_state.cpp
@@ -73,10 +73,7 @@ void GLStateManager::resetVAO() {
 }
 
 void GLStateManager::resetBuffers() {
-	boundVBO = 0;
 	boundUBO = 0;
-
-	glBindBuffer(GL_ARRAY_BUFFER, 0);
 	glBindBuffer(GL_UNIFORM_BUFFER, 0);
 }
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 3b2d1d70b..4ed1eac1e 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -97,7 +97,7 @@ void RendererGL::initGraphicsContextInternal() {
 	glBufferData(GL_UNIFORM_BUFFER, PICAShader::totalUniformSize(), nullptr, GL_DYNAMIC_DRAW);
 
 	vbo.createFixedSize(sizeof(Vertex) * vertexBufferSize * 2, GL_STREAM_DRAW);
-	gl.bindVBO(vbo);
+	vbo.bind();
 	// Initialize the VAO used when not using hw shaders
 	defaultVAO.create();
 	gl.bindVAO(defaultVAO);
@@ -439,7 +439,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	gl.bindVBO(vbo);
+	vbo.bind();
 	gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
 
 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
@@ -1135,11 +1135,37 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 		GL_FLOAT,          // 3: Float
 	};
 
+	const u32 vertexCount = accel->maximumIndex - accel->minimumIndex + 1;
+
+	// Update index buffer if necessary
+	if (accel->indexed) {
+		const bool shortIndex = accel->useShortIndices;
+		const usize indexBufferSize = usize(vertexCount) * (shortIndex ? sizeof(u16) : sizeof(u8));
+
+		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
+		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
+		hwIndexBuffer->Unmap(indexBufferSize);
+	}
+
+	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
+	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+
 	for (int i = 0; i < totalAttribCount; i++) {
 		const auto& attrib = accel->attributeInfo[i];
-		printf(
-			"%s attribute starting from offset %d with a size of %d components\n", attrib.fixed ? "Fixed" : "Variable", (!attrib.fixed) ? attrib.offset : 0,
-			!attrib.fixed ? attrib.componentCount : 4
-		);
+		
+		if (attrib.fixed) {
+			Helpers::panic("Fixed attribute!");
+		} else {
+			if (attrib.isPadding) {
+				continue;
+			}
+
+			const u32 attributeSize = attrib.size * vertexCount;
+
+			std::memcpy(vertexData, attrib.data, attributeSize);
+			vertexData += attributeSize;
+		}
 	}
+
+	hwVertexBuffer->Unmap(accel->vertexDataSize);
 }
\ No newline at end of file

From 33e63f7d7ac826066ec8a8bfea2ed9021f29c8c2 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 16:02:54 +0300
Subject: [PATCH 39/63] GPU: Cleanup immediate mode handling

---
 include/renderer.hpp                 | 2 +-
 include/renderer_gl/renderer_gl.hpp  | 2 +-
 src/core/PICA/gpu.cpp                | 2 +-
 src/core/PICA/regs.cpp               | 2 +-
 src/core/renderer_gl/renderer_gl.cpp | 9 +++++----
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/renderer.hpp b/include/renderer.hpp
index 94a0b0f3c..5a1efc773 100644
--- a/include/renderer.hpp
+++ b/include/renderer.hpp
@@ -84,7 +84,7 @@ class Renderer {
 	// It is responsible for things like looking up which vertex/fragment shaders to use, recompiling them if they don't exist, choosing between
 	// ubershaders and shadergen, and so on.
 	// Returns whether this draw is eligible for using hardware-accelerated shaders or if shaders should run on the CPU
-	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) { return false; }
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) { return false; }
 
 	// Functions for initializing the graphics context for the Qt frontend, where we don't have the convenience of SDL_Window
 #ifdef PANDA3DS_FRONTEND_QT
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 63bbb474c..162864845 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -160,7 +160,7 @@ class RendererGL final : public Renderer {
 	virtual bool supportsShaderReload() override { return true; }
 	virtual std::string getUbershader() override;
 	virtual void setUbershader(const std::string& shader) override;
-	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) override;
+	virtual bool prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) override;
 	
 	std::optional<ColourBuffer> getColourBuffer(u32 addr, PICA::ColorFmt format, u32 width, u32 height, bool createIfnotFound = true);
 
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 76f3acea1..64dc5beb7 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -131,7 +131,7 @@ void GPU::drawArrays(bool indexed) {
 		getAcceleratedDrawInfo(accel, indexed);
 	}
 
-	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel, false);
+	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
 
 	if (hwShaders) {
 		if (indexed) {
diff --git a/src/core/PICA/regs.cpp b/src/core/PICA/regs.cpp
index 091bd377d..4c865d122 100644
--- a/src/core/PICA/regs.cpp
+++ b/src/core/PICA/regs.cpp
@@ -249,7 +249,7 @@ void GPU::writeInternalReg(u32 index, u32 value, u32 mask) {
 						// If we've reached 3 verts, issue a draw call
 						// Handle rendering depending on the primitive type
 						if (immediateModeVertIndex == 3) {
-							renderer->prepareForDraw(shaderUnit, nullptr, true);
+							renderer->prepareForDraw(shaderUnit, nullptr);
 							renderer->drawVertices(PICA::PrimType::TriangleList, immediateModeVertices);
 
 							switch (primType) {
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 4ed1eac1e..fc6e2ce6b 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -950,7 +950,7 @@ OpenGL::Program& RendererGL::getSpecializedShader() {
 	return program;
 }
 
-bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel, bool isImmediateMode) {
+bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration* accel) {
 	// First we figure out if we will be using an ubershader
 	bool usingUbershader = emulatorConfig->useUbershaders;
 	if (usingUbershader) {
@@ -966,7 +966,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 
 	// Then we figure out if we will use hw accelerated shaders, and try to fetch our shader
 	// TODO: Ubershader support for accelerated shaders
-	usingAcceleratedShader = emulatorConfig->accelerateShaders && !isImmediateMode && !usingUbershader;
+	usingAcceleratedShader = emulatorConfig->accelerateShaders && !usingUbershader && accel != nullptr && accel->canBeAccelerated;
 
 	if (usingAcceleratedShader) {
 		PICA::VertConfig vertexConfig(shaderUnit.vs, regs, usingUbershader);
@@ -1000,9 +1000,10 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 				shaderUnit.vs.uniformsDirty = false;
 				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
 			}
-		}
 
-		accelerateVertexUpload(shaderUnit, accel);
+			// Upload vertex data and index buffer data to our GPU
+			accelerateVertexUpload(shaderUnit, accel);
+		}
 	}
 
 	if (usingUbershader) {

From 5432a5a0d87ed17a81b7ac865f8b06413b893821 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 17:14:19 +0300
Subject: [PATCH 40/63] Get first renders working with accelerated draws

---
 include/PICA/draw_acceleration.hpp           |  1 +
 include/renderer_gl/renderer_gl.hpp          |  1 +
 src/core/PICA/draw_acceleration.cpp          |  1 +
 src/core/PICA/gpu.cpp                        | 96 +++++++-------------
 src/core/renderer_gl/renderer_gl.cpp         | 38 +++++---
 third_party/duckstation/gl/stream_buffer.cpp |  4 +-
 6 files changed, 63 insertions(+), 78 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index 2ec3f318a..1671825ea 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -12,6 +12,7 @@ namespace PICA {
 			u8* data;
 			u32 offset;
 			u32 size;
+			u32 stride;
 
 			u8 type;
 			u8 componentCount;
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 162864845..b643534a6 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -62,6 +62,7 @@ class RendererGL final : public Renderer {
 	bool oldDepthmapEnable = false;
 	// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
 	bool usingAcceleratedShader = false;
+	bool performIndexedRender = false;
 
 	// Cached pointer to the current vertex shader when using HW accelerated shaders
 	OpenGL::Shader* generatedVertexShader = nullptr;
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 22b1f0413..7646577fc 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -98,6 +98,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
 				attr.size = size * sizePerComponent[attribType];
+				attr.stride = attrData.size;
 				attr.type = attribType;
 				attr.isPadding = false;
 				attributeOffset += attr.size;
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 64dc5beb7..dad24a22a 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -120,6 +120,8 @@ void GPU::reset() {
 	renderer->reset();
 }
 
+static std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
+
 // Call the correct version of drawArrays based on whether this is an indexed draw (first template parameter)
 // And whether we are going to use the shader JIT (second template parameter)
 void GPU::drawArrays(bool indexed) {
@@ -134,11 +136,13 @@ void GPU::drawArrays(bool indexed) {
 	const bool hwShaders = renderer->prepareForDraw(shaderUnit, &accel);
 
 	if (hwShaders) {
-		if (indexed) {
-			drawArrays<true, ShaderExecMode::Hardware>();
-		} else {
-			drawArrays<false, ShaderExecMode::Hardware>();
-		}
+		// Hardware shaders have their own accelerated code path for draws, so they skip everything here
+		const PICA::PrimType primType = static_cast<PICA::PrimType>(Helpers::getBits<8, 2>(regs[PICA::InternalRegs::PrimitiveConfig]));
+		// Total # of vertices to render
+		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
+
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
+		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 	} else {
 		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
 
@@ -158,33 +162,17 @@ void GPU::drawArrays(bool indexed) {
 	}
 }
 
-// We need a union here, because unfortunately in CPU shaders we only need to store the vertex shader outputs in the vertex buffer,
-// which consist of 8 vec4 attributes, while with GPU shaders we need to pass all the vertex shader inputs to the GPU, which consist
-// of 16 vec4 attributes
-union PICAVertexBuffer {
-	// Used with CPU shaders
-	std::array<PICA::Vertex, Renderer::vertexBufferSize> vertices;
-	// Used with GPU shaders. We can have up to 16 attributes per vertex, each attribute with 4 floats
-	std::array<float, Renderer::vertexBufferSize * 16 * 4> vsInputs;
-
-	PICAVertexBuffer() {}
-};
-
-static PICAVertexBuffer vertexBuffer;
-
 template <bool indexed, ShaderExecMode mode>
 void GPU::drawArrays() {
 	if constexpr (mode == ShaderExecMode::JIT) {
 		shaderJIT.prepare(shaderUnit.vs);
+	} else if constexpr (mode == ShaderExecMode::Hardware) {
+		// Hardware shaders have their own accelerated code path for draws, so they're not meant to take this path
+		Helpers::panic("GPU::DrawArrays: Hardware shaders shouldn't take this path!");
 	}
 
 	// We can have up to 16 attributes, each one consisting of 4 floats
 	constexpr u32 maxAttrSizeInFloats = 16 * 4;
-	auto& vertices = vertexBuffer.vertices;
-
-	if constexpr (mode != ShaderExecMode::Hardware) {
-		setVsOutputMask(regs[PICA::InternalRegs::VertexShaderOutputMask]);
-	}
 
 	// Base address for vertex attributes
 	// The vertex base is always on a quadword boundary because the PICA does weird alignment shit any time possible
@@ -257,15 +245,7 @@ void GPU::drawArrays() {
 			size_t tag = vertexIndex % vertexCacheSize;
 			// Cache hit
 			if (cache.validBits[tag] && cache.ids[tag] == vertexIndex) {
-				if constexpr (mode != ShaderExecMode::Hardware) {
-					vertices[i] = vertices[cache.bufferPositions[tag]];
-				} else {
-					const u32 cachedBufferPosition = cache.bufferPositions[tag] * maxAttrSizeInFloats;
-					std::memcpy(
-						&vertexBuffer.vsInputs[i * maxAttrSizeInFloats], &vertexBuffer.vsInputs[cachedBufferPosition],
-						sizeof(float) * maxAttrSizeInFloats
-					);
-				}
+				vertices[i] = vertices[cache.bufferPositions[tag]];
 				continue;
 			}
 
@@ -370,39 +350,29 @@ void GPU::drawArrays() {
 			}
 		}
 
-		// Running shader on the CPU instead of the GPU
-		if constexpr (mode == ShaderExecMode::Interpreter || mode == ShaderExecMode::JIT) {
-			// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
-			// Based on the SH_ATTRIBUTES_PERMUTATION registers.
-			// Ie it might map attribute #0 to v2, #1 to v7, etc
-			for (int j = 0; j < totalAttribCount; j++) {
-				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
-			}
+		// Before running the shader, the PICA maps the fetched attributes from the attribute registers to the shader input registers
+		// Based on the SH_ATTRIBUTES_PERMUTATION registers.
+		// Ie it might map attribute #0 to v2, #1 to v7, etc
+		for (int j = 0; j < totalAttribCount; j++) {
+			const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
+			std::memcpy(&shaderUnit.vs.inputs[mapping], &currentAttributes[j], sizeof(vec4f));
+		}
 
-			if constexpr (mode == ShaderExecMode::JIT) {
-				shaderJIT.run(shaderUnit.vs);
-			} else {
-				shaderUnit.vs.run();
-			}
+		if constexpr (mode == ShaderExecMode::JIT) {
+			shaderJIT.run(shaderUnit.vs);
+		} else {
+			shaderUnit.vs.run();
+		}
 
-			PICA::Vertex& out = vertices[i];
-			// Map shader outputs to fixed function properties
-			const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
-			for (int i = 0; i < totalShaderOutputs; i++) {
-				const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
+		PICA::Vertex& out = vertices[i];
+		// Map shader outputs to fixed function properties
+		const u32 totalShaderOutputs = regs[PICA::InternalRegs::ShaderOutputCount] & 7;
+		for (int i = 0; i < totalShaderOutputs; i++) {
+			const u32 config = regs[PICA::InternalRegs::ShaderOutmap0 + i];
 
-				for (int j = 0; j < 4; j++) {  // pls unroll
-					const u32 mapping = (config >> (j * 8)) & 0x1F;
-					out.raw[mapping] = vsOutputRegisters[i][j];
-				}
-			}
-		} else {  // Using hw shaders and running the shader on the CPU, just write the inputs to the attribute buffer directly
-			float* out = &vertexBuffer.vsInputs[i * maxAttrSizeInFloats];
-			for (int j = 0; j < totalAttribCount; j++) {
-				const u32 mapping = (inputAttrCfg >> (j * 4)) & 0xf;
-				// Multiply mapping * 4 as mapping refers to a vec4 whereas out is an array of floats
-				std::memcpy(&out[mapping * 4], &currentAttributes[j], sizeof(vec4f));
+			for (int j = 0; j < 4; j++) {  // pls unroll
+				const u32 mapping = (config >> (j * 8)) & 0x1F;
+				out.raw[mapping] = vsOutputRegisters[i][j];
 			}
 		}
 	}
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index fc6e2ce6b..82248d535 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -129,11 +129,6 @@ void RendererGL::initGraphicsContextInternal() {
 
 	// Initialize the VAO used for hw shaders
 	hwShaderVAO.create();
-	gl.bindVAO(hwShaderVAO);
-	for (int attr = 0; attr < 16; attr++) {
-		hwShaderVAO.setAttributeFloat<float>(attr, 4, sizeof(Vertex) * 2, attr * sizeof(float) * 4);
-		hwShaderVAO.enableAttribute(attr);
-	}
 
 	dummyVBO.create();
 	dummyVAO.create();
@@ -439,8 +434,14 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
-	vbo.bind();
-	gl.bindVAO(usingAcceleratedShader ? hwShaderVAO : defaultVAO);
+
+	if (usingAcceleratedShader) {
+		hwVertexBuffer->Bind();
+		gl.bindVAO(hwShaderVAO);
+	} else {
+		vbo.bind();
+		gl.bindVAO(defaultVAO);
+	}
 
 	gl.enableClipPlane(0);  // Clipping plane 0 is always enabled
 	if (regs[PICA::InternalRegs::ClipEnable] & 1) {
@@ -503,15 +504,19 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 
 	setupStencilTest(stencilEnable);
 
-	// If we're using hardware shaders, the vertex array works completely different
-	// And instead of 8 vec4 attributes, each vertex is 16 vec4 attributes. We use a union + aliasing which is not ideal for readability.
 	if (!usingAcceleratedShader) {
 		vbo.bufferVertsSub(vertices);
+		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
-		glBufferSubData(GL_ARRAY_BUFFER, 0, vertices.size_bytes() * 2, vertices.data());
+		if (performIndexedRender) {
+			// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
+			hwIndexBuffer->Bind();
+			//glDrawRangeElementsBaseVertex();
+		} else {
+			// When doing non-indexed rendering, just use glDrawArrays
+			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
+		}
 	}
-
-	OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 }
 
 void RendererGL::display() {
@@ -1003,6 +1008,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 
 			// Upload vertex data and index buffer data to our GPU
 			accelerateVertexUpload(shaderUnit, accel);
+			performIndexedRender = accel->indexed;
 		}
 	}
 
@@ -1149,7 +1155,9 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	}
 
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
+
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	gl.bindVAO(hwShaderVAO);
 
 	for (int i = 0; i < totalAttribCount; i++) {
 		const auto& attrib = accel->attributeInfo[i];
@@ -1161,9 +1169,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 				continue;
 			}
 
-			const u32 attributeSize = attrib.size * vertexCount;
+			glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
+			// TODO: Disable unused attributes as well
+			hwShaderVAO.enableAttribute(i);
 
+			const u32 attributeSize = attrib.size * vertexCount;
 			std::memcpy(vertexData, attrib.data, attributeSize);
+			
 			vertexData += attributeSize;
 		}
 	}
diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
index f4f8b54cf..ff6c79f9b 100644
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -132,7 +132,7 @@ namespace {
 			const u32 end = GetSyncIndexForOffset(offset);
 			for (; m_used_block_index < end; m_used_block_index++) {
 				if (m_sync_objects[m_used_block_index]) {
-					Helpers::panic("GL stream buffer: Fence slot we're trying to insert is already in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to insert is already in use");
 				}
 
 				m_sync_objects[m_used_block_index] = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
@@ -149,7 +149,7 @@ namespace {
 			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 			for (; m_available_block_index < end; m_available_block_index++) {
 				if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
-					Helpers::panic("GL stream buffer: Fence slot we're trying to wait on in not in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
 				}
 
 				WaitForSync(m_sync_objects[m_available_block_index]);

From e925a91e405545c22dc13d5c326d2fdccf17f72c Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 18:38:22 +0300
Subject: [PATCH 41/63] Shader decompiler: Fix control flow analysis bugs

---
 src/core/PICA/shader_decompiler.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 133637a7b..75de4e504 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -18,7 +18,7 @@ void ControlFlow::analyze(const PICAShader& shader, u32 entrypoint) {
 	analysisFailed = false;
 
 	const Function* function = addFunction(shader, entrypoint, PICAShader::maxInstructionCount);
-	if (function == nullptr) {
+	if (function == nullptr || function->exitMode != ExitMode::AlwaysEnd) {
 		analysisFailed = true;
 	}
 }
@@ -83,6 +83,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
 				return it->second;
 			}
+
 			case ShaderOpcodes::IFU:
 			case ShaderOpcodes::IFC: {
 				const u32 num = instruction & 0xff;
@@ -114,7 +115,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 					it->second = parallel;
 					return it->second;
 				} else {
-					ExitMode afterConditional = analyzeFunction(shader, pc + 1, end, labels);
+					ExitMode afterConditional = analyzeFunction(shader, dest + num, end, labels);
 					ExitMode conditionalExitMode = exitSeries(parallel, afterConditional);
 					it->second = conditionalExitMode;
 					return it->second;
@@ -139,7 +140,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 
 				// Exit mode of the remainder of this function, after we return from the callee
 				const ExitMode postCallExitMode = analyzeFunction(shader, pc + 1, end, labels);
-				const ExitMode exitMode = exitSeries(postCallExitMode, calledFunction->exitMode);
+				const ExitMode exitMode = exitSeries(calledFunction->exitMode, postCallExitMode);
 
 				it->second = exitMode;
 				return exitMode;
@@ -179,7 +180,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				}
 
 				const ExitMode afterLoop = analyzeFunction(shader, dest + 1, end, labels);
-				const ExitMode exitMode = exitSeries(afterLoop, loopFunction->exitMode);
+				const ExitMode exitMode = exitSeries(loopFunction->exitMode, afterLoop);
 				it->second = exitMode;
 				return it->second;
 			}
@@ -190,7 +191,8 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 	}
 
 	// A function without control flow instructions will always reach its "return point" and return
-	return ExitMode::AlwaysReturn;
+	it->second = ExitMode::AlwaysReturn;
+	return it->second;
 }
 
 std::pair<u32, bool> ShaderDecompiler::compileRange(const AddressRange& range) {

From 37a43e245f2e901d46c8cacf948d8909c1d343a5 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 19:13:37 +0300
Subject: [PATCH 42/63] HW shaders: Accelerate indexed draws

---
 include/renderer_gl/renderer_gl.hpp  |  8 +++++++-
 src/core/PICA/gpu.cpp                |  2 +-
 src/core/renderer_gl/renderer_gl.cpp | 26 +++++++++++++++++---------
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index b643534a6..30b170266 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -60,9 +60,15 @@ class RendererGL final : public Renderer {
 	float oldDepthScale = -1.0;
 	float oldDepthOffset = 0.0;
 	bool oldDepthmapEnable = false;
-	// Set by prepareDraw, tells us whether the current draw is using hw-accelerated shader
+	// Set by prepareForDraw, tells us whether the current draw is using hw-accelerated shader
 	bool usingAcceleratedShader = false;
 	bool performIndexedRender = false;
+	bool usingShortIndices = false;
+
+	// Set by prepareForDraw, metadata for indexed renders
+	GLuint minimumIndex = 0;
+	GLuint maximumIndex = 0;
+	void* hwIndexBufferOffset = nullptr;
 
 	// Cached pointer to the current vertex shader when using HW accelerated shaders
 	OpenGL::Shader* generatedVertexShader = nullptr;
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index dad24a22a..2797e09fb 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -141,7 +141,7 @@ void GPU::drawArrays(bool indexed) {
 		// Total # of vertices to render
 		const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];
 
-		// Note: In the hardware shader path the vertices span shouldn't actually be used as the rasterizer will perform its own attribute fetching
+		// Note: In the hardware shader path the vertices span shouldn't actually be used as the renderer will perform its own attribute fetching
 		renderer->drawVertices(primType, std::span(vertices).first(vertexCount));
 	} else {
 		const bool shaderJITEnabled = ShaderJIT::isAvailable() && config.shaderJitEnabled;
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 82248d535..536cb6fad 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -435,10 +435,8 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	const auto primitiveTopology = primTypes[static_cast<usize>(primType)];
 	gl.disableScissor();
 
-	if (usingAcceleratedShader) {
-		hwVertexBuffer->Bind();
-		gl.bindVAO(hwShaderVAO);
-	} else {
+	// If we're using accelerated shaders, the hw VAO, VBO and EBO objects will have already been bound in prepareForDraw
+	if (!usingAcceleratedShader) {
 		vbo.bind();
 		gl.bindVAO(defaultVAO);
 	}
@@ -509,9 +507,12 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
 		if (performIndexedRender) {
-			// When doing indexed rendering, bind the IBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
+			// When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
 			hwIndexBuffer->Bind();
-			//glDrawRangeElementsBaseVertex();
+			glDrawRangeElementsBaseVertex(
+				primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
+				hwIndexBufferOffset, -minimumIndex
+			);
 		} else {
 			// When doing non-indexed rendering, just use glDrawArrays
 			OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
@@ -1008,7 +1009,10 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 
 			// Upload vertex data and index buffer data to our GPU
 			accelerateVertexUpload(shaderUnit, accel);
+
 			performIndexedRender = accel->indexed;
+			minimumIndex = GLsizei(accel->minimumIndex);
+			maximumIndex = GLsizei(accel->maximumIndex);
 		}
 	}
 
@@ -1146,17 +1150,21 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 
 	// Update index buffer if necessary
 	if (accel->indexed) {
-		const bool shortIndex = accel->useShortIndices;
-		const usize indexBufferSize = usize(vertexCount) * (shortIndex ? sizeof(u16) : sizeof(u8));
+		usingShortIndices = accel->useShortIndices;
+		const usize indexBufferSize = usize(vertexCount) * (usingShortIndices ? sizeof(u16) : sizeof(u8));
 
+		hwIndexBuffer->Bind();
 		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);
+		hwIndexBufferOffset = reinterpret_cast<void*>(usize(indexBufferRes.buffer_offset));
+
 		std::memcpy(indexBufferRes.pointer, accel->indexBuffer, indexBufferSize);
 		hwIndexBuffer->Unmap(indexBufferSize);
 	}
 
+	hwVertexBuffer->Bind();
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
-
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+
 	gl.bindVAO(hwShaderVAO);
 
 	for (int i = 0; i < totalAttribCount; i++) {

From ca2d7e40eaab6f5278d6b70aa1f6fbae2f308aa7 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 19:34:56 +0300
Subject: [PATCH 43/63] Shader decompiler: Add support for compilation errors

---
 include/PICA/shader_decompiler.hpp  |  1 +
 src/core/PICA/shader_decompiler.cpp | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/PICA/shader_decompiler.hpp b/include/PICA/shader_decompiler.hpp
index b7bd869c3..4a5cdc138 100644
--- a/include/PICA/shader_decompiler.hpp
+++ b/include/PICA/shader_decompiler.hpp
@@ -99,6 +99,7 @@ namespace PICA::ShaderGen {
 
 		API api;
 		Language language;
+		bool compilationError = false;
 
 		void compileInstruction(u32& pc, bool& finished);
 		// Compile range "range" and returns the end PC or if we're "finished" with the program (called an END instruction)
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 75de4e504..2d4d29632 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -247,6 +247,7 @@ std::string ShaderDecompiler::decompile() {
 		return "";
 	}
 
+	compilationError = false;
 	decompiledShader = "";
 
 	switch (api) {
@@ -324,6 +325,13 @@ std::string ShaderDecompiler::decompile() {
 		}
 	}
 
+	// We allow some leeway for "compilation errors" in addition to control flow errors, in cases where eg an unimplemented instruction
+	// or an instruction that we can't emulate in GLSL is found in the instruction stream. Just like control flow errors, these return an empty string
+	// and the renderer core will decide to use CPU shaders instead
+	if (compilationError) [[unlikely]] {
+		return "";
+	}
+
 	return decompiledShader;
 }
 
@@ -707,7 +715,11 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				return;
 
 			case ShaderOpcodes::NOP: break;
-			default: Helpers::panic("GLSL recompiler: Unknown opcode: %X", opcode); break;
+
+			default:
+				Helpers::warn("GLSL recompiler: Unknown opcode: %X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	}
 

From 0c2ae1b7d07df911ded44b581ea6125dcada7a0c Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 25 Aug 2024 19:55:47 +0300
Subject: [PATCH 44/63] GLSL decompiler: Fall back for LITP

---
 src/core/PICA/shader_decompiler.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 2d4d29632..347df5c5e 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -546,7 +546,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				break;
 			}
 
-			default: Helpers::panic("GLSL recompiler: Unknown common opcode: %X", opcode); break;
+			default:
+				Helpers::warn("GLSL recompiler: Unknown common opcode: %02X. Falling back to CPU shaders", opcode);
+				compilationError = true;
+				break;
 		}
 	} else if (opcode >= 0x30 && opcode <= 0x3F) { // MAD and MADI
 		const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x1f];
@@ -717,7 +720,7 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 			case ShaderOpcodes::NOP: break;
 
 			default:
-				Helpers::warn("GLSL recompiler: Unknown opcode: %X. Falling back to CPU shaders", opcode);
+				Helpers::warn("GLSL recompiler: Unknown opcode: %02X. Falling back to CPU shaders", opcode);
 				compilationError = true;
 				break;
 		}

From 0e7697dc673c4c58a01fba0a74bdb4c941002292 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Mon, 26 Aug 2024 00:43:36 +0300
Subject: [PATCH 45/63] Add Renderdoc scope classes

---
 include/renderdoc.hpp | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/renderdoc.hpp b/include/renderdoc.hpp
index 94a0f4944..ea2c8a3d6 100644
--- a/include/renderdoc.hpp
+++ b/include/renderdoc.hpp
@@ -35,4 +35,35 @@ namespace Renderdoc {
 	static void setOutputDir(const std::string& path, const std::string& prefix) {}
 	static constexpr bool isSupported() { return false; }
 }  // namespace Renderdoc
-#endif
\ No newline at end of file
+#endif
+
+namespace Renderdoc {
+	// RAII scope class that encloses a Renderdoc capture, as long as it's triggered by triggerCapture
+	struct Scope {
+		Scope() { Renderdoc::startCapture(); }
+		~Scope() { Renderdoc::endCapture(); }
+		// Non-copyable and non-movable, so every startCapture is paired with exactly one endCapture
+		Scope(const Scope&) = delete;
+		Scope& operator=(const Scope&) = delete;
+
+		Scope(Scope&&) = delete;
+		Scope& operator=(Scope&&) = delete;
+	};
+
+	// RAII scope class that encloses a Renderdoc capture. Unlike regular Scope it doesn't wait for a trigger, it will always issue the capture
+	// trigger on its own and take a capture
+	struct InstantScope {
+		InstantScope() {
+			Renderdoc::triggerCapture();
+			Renderdoc::startCapture();
+		}
+
+		~InstantScope() { Renderdoc::endCapture(); }
+		// Non-copyable and non-movable, so every startCapture is paired with exactly one endCapture
+		InstantScope(const InstantScope&) = delete;
+		InstantScope& operator=(const InstantScope&) = delete;
+
+		InstantScope(InstantScope&&) = delete;
+		InstantScope& operator=(InstantScope&&) = delete;
+	};
+}  // namespace Renderdoc
\ No newline at end of file

From e332ab2e58b7ce42e245ca765bbdc7950ace1dd7 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 3 Sep 2024 01:59:58 +0300
Subject: [PATCH 46/63] Fix control flow analysis bug

---
 include/PICA/draw_acceleration.hpp   |  2 ++
 include/renderer_gl/renderer_gl.hpp  |  3 ++
 src/core/PICA/draw_acceleration.cpp  | 12 +++++++-
 src/core/PICA/shader_decompiler.cpp  |  3 +-
 src/core/PICA/shader_gen_glsl.cpp    |  2 --
 src/core/renderer_gl/renderer_gl.cpp | 46 ++++++++++++++++++++++------
 6 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index 1671825ea..72eb8944c 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -14,6 +14,7 @@ namespace PICA {
 			u32 size;
 			u32 stride;
 
+			u8 inputReg; // Which input reg should this attribute go to in the vertex shader?
 			u8 type;
 			u8 componentCount;
 			bool fixed;
@@ -27,6 +28,7 @@ namespace PICA {
 		// Minimum and maximum index in the index buffer for a draw call
 		u16 minimumIndex, maximumIndex;
 		u32 totalAttribCount;
+		u32 enabledAttributeMask;
 		u32 vertexDataSize;
 
 		std::array<AttributeInfo, maxAttribCount> attributeInfo;
diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 30b170266..137c48898 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -70,6 +70,9 @@ class RendererGL final : public Renderer {
 	GLuint maximumIndex = 0;
 	void* hwIndexBufferOffset = nullptr;
 
+	// When doing hw shaders, we cache which attributes are enabled in our VAO to avoid having to enable/disable all attributes on each draw
+	u32 previousAttributeMask = 0;
+
 	// Cached pointer to the current vertex shader when using HW accelerated shaders
 	OpenGL::Shader* generatedVertexShader = nullptr;
 
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 7646577fc..538a714eb 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -8,7 +8,8 @@
 void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	accel.indexed = indexed;
 	accel.totalAttribCount = totalAttribCount;
-
+	accel.enabledAttributeMask = 0;
+	
 	const u32 vertexBase = ((regs[PICA::InternalRegs::VertexAttribLoc] >> 1) & 0xfffffff) * 16;
 	const u32 vertexCount = regs[PICA::InternalRegs::VertexCountReg];  // Total # of vertices to transfer
 
@@ -50,6 +51,8 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	}
 
 	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
+	const u64 inputAttrCfg = getVertexShaderInputConfig();
+
 	u32 buffer = 0;
 	u32 attrCount = 0;
 	accel.vertexDataSize = 0;
@@ -94,7 +97,11 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			
 				// Size of each component based on the attribute type
 				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+				const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
+				// Mark the attribute as enabled
+				accel.enabledAttributeMask |= 1 << inputReg;
 
+				attr.inputReg = inputReg;
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
 				attr.size = size * sizePerComponent[attribType];
@@ -123,6 +130,9 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				attr.fixedValue[i] = fixedAttr[i].toFloat32();
 			}
 
+			const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
+
+			attr.inputReg = inputReg;
 			attrCount += 1;
 		}
 	}
diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 347df5c5e..ead984100 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -79,7 +79,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 
 				// This opens up 2 parallel paths of execution
 				auto branchTakenExit = analyzeFunction(shader, dest, end, labels);
-				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, dest, labels);
+				auto branchNotTakenExit = analyzeFunction(shader, pc + 1, end, labels);
 				it->second = exitParallel(branchTakenExit, branchNotTakenExit);
 				return it->second;
 			}
@@ -122,6 +122,7 @@ ExitMode ControlFlow::analyzeFunction(const PICAShader& shader, u32 start, u32 e
 				}
 				break;
 			}
+
 			case ShaderOpcodes::CALL: {
 				const u32 num = instruction & 0xff;
 				const u32 dest = getBits<10, 12>(instruction);
diff --git a/src/core/PICA/shader_gen_glsl.cpp b/src/core/PICA/shader_gen_glsl.cpp
index affe9837f..8fc2b126c 100644
--- a/src/core/PICA/shader_gen_glsl.cpp
+++ b/src/core/PICA/shader_gen_glsl.cpp
@@ -778,8 +778,6 @@ void main() {
 	gl_ClipDistance[1] = dot(clipCoords, a_coords);
 #endif
 })";
-		
-		std::cout << ret << "\n";
 		return ret;
 	}
 }
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 536cb6fad..3d0119552 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -2,6 +2,7 @@
 
 #include <stb_image_write.h>
 
+#include <bit>
 #include <cmrc/cmrc.hpp>
 
 #include "PICA/float_types.hpp"
@@ -987,7 +988,7 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
 			);
 
-			// Empty source means compilation error, if the source is not empty then we convert the rcompiled PICA code into a valid shader and upload
+			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload
 			// it to the GPU
 			if (!picaShaderSource.empty()) {
 				std::string vertexShaderSource = fragShaderGen.getVertexShaderAccelerated(picaShaderSource, vertexConfig, usingUbershader);
@@ -1167,24 +1168,49 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 
 	gl.bindVAO(hwShaderVAO);
 
+	// Enable or disable vertex attributes as needed
+	const u32 currentAttributeMask = accel->enabledAttributeMask;
+	// Use bitwise xor to calculate which attributes chanced
+	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
+	
+	while (attributeMaskDiff != 0) {
+		// Get index of next different attribute and turn it off
+		const u32 index = 31 - std::countl_zero<u32>(attributeMaskDiff);
+		const u32 mask = 1u << index;
+		attributeMaskDiff ^= mask;
+
+		if ((currentAttributeMask & mask) != 0) {
+			// Attribute was disabled and is now enabled
+			hwShaderVAO.enableAttribute(index);
+		} else {
+			// Attribute was enabled and is now disabled
+			hwShaderVAO.disableAttribute(index);
+		}
+	}
+
+	previousAttributeMask = currentAttributeMask;
+
 	for (int i = 0; i < totalAttribCount; i++) {
 		const auto& attrib = accel->attributeInfo[i];
-		
+
 		if (attrib.fixed) {
-			Helpers::panic("Fixed attribute!");
+			if ((currentAttributeMask & (1u << i)) == 0) {
+				glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			}
 		} else {
-			if (attrib.isPadding) {
+			if (attrib.isPadding) [[unlikely]] {
 				continue;
 			}
-
-			glVertexAttribPointer(i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride, reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset));
-			// TODO: Disable unused attributes as well
-			hwShaderVAO.enableAttribute(i);
-
+	
 			const u32 attributeSize = attrib.size * vertexCount;
 			std::memcpy(vertexData, attrib.data, attributeSize);
-			
+
 			vertexData += attributeSize;
+
+			glVertexAttribPointer(
+				attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset)
+			);
 		}
 	}
 

From 15b6a9e2d947e46a192041dbe860e2e502eac619 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Tue, 3 Sep 2024 02:21:20 +0300
Subject: [PATCH 47/63] HW shaders: Fix attribute fetch

---
 src/core/PICA/draw_acceleration.cpp  | 9 +++++----
 src/core/renderer_gl/renderer_gl.cpp | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 538a714eb..84096fb74 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -64,7 +64,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 		if (!fixedAttrib) {
 			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
 			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
-			u32 attributeOffset = attrData.offset;
 
 			if (attrData.componentCount != 0) {
 				// Size of the attribute in bytes multiplied by the total number of vertices
@@ -73,6 +72,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				accel.vertexDataSize += (bytes + 3) & ~3;
 			}
 
+			u32 attributeOffset = 0;
 			for (int i = 0; i < attrData.componentCount; i++) {
 				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
 				auto& attr = accel.attributeInfo[attrCount];
@@ -101,6 +101,10 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				// Mark the attribute as enabled
 				accel.enabledAttributeMask |= 1 << inputReg;
 
+				// Get a pointer to the data where this attribute is stored
+				const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size);
+
+				attr.data = getPointerPhys<u8>(attrAddress);
 				attr.inputReg = inputReg;
 				attr.componentCount = size;
 				attr.offset = attributeOffset;
@@ -110,9 +114,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 				attr.isPadding = false;
 				attributeOffset += attr.size;
 
-				// Get a pointer to the data where this attribute is stored
-				const u32 attrAddress = vertexBase + attr.offset + (accel.minimumIndex * attrData.size);
-				attr.data = getPointerPhys<u8>(attrAddress);
 				attrCount += 1;
 			}
 
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 3d0119552..6447f7635 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -512,7 +512,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 			hwIndexBuffer->Bind();
 			glDrawRangeElementsBaseVertex(
 				primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
-				hwIndexBufferOffset, -minimumIndex
+				hwIndexBufferOffset, -GLint(minimumIndex)
 			);
 		} else {
 			// When doing non-indexed rendering, just use glDrawArrays

From 4a39b06262fb2b9b4fcc28293f23d82b7d4ff628 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 4 Sep 2024 03:18:39 +0300
Subject: [PATCH 48/63] Rewriting hw vertex fetch

---
 include/PICA/draw_acceleration.hpp           |  17 ++-
 src/core/PICA/draw_acceleration.cpp          | 145 ++++++++++---------
 src/core/PICA/gpu.cpp                        |   2 -
 src/core/renderer_gl/renderer_gl.cpp         |  42 +++---
 third_party/duckstation/gl/stream_buffer.cpp |   2 +-
 5 files changed, 110 insertions(+), 98 deletions(-)

diff --git a/include/PICA/draw_acceleration.hpp b/include/PICA/draw_acceleration.hpp
index 72eb8944c..6a66cdc1f 100644
--- a/include/PICA/draw_acceleration.hpp
+++ b/include/PICA/draw_acceleration.hpp
@@ -6,32 +6,37 @@
 
 namespace PICA {
 	struct DrawAcceleration {
-		static constexpr u32 maxAttribCount = 12;
+		static constexpr u32 maxAttribCount = 16;
+		static constexpr u32 maxLoaderCount = 12;
 
 		struct AttributeInfo {
-			u8* data;
 			u32 offset;
-			u32 size;
 			u32 stride;
 
-			u8 inputReg; // Which input reg should this attribute go to in the vertex shader?
 			u8 type;
 			u8 componentCount;
-			bool fixed;
-			bool isPadding;
 
 			std::array<float, 4> fixedValue;  // For fixed attributes
 		};
 
+		struct Loader {
+			// Data to upload for this loader
+			u8* data;
+			usize size;
+		};
+
 		u8* indexBuffer;
 
 		// Minimum and maximum index in the index buffer for a draw call
 		u16 minimumIndex, maximumIndex;
 		u32 totalAttribCount;
+		u32 totalLoaderCount;
 		u32 enabledAttributeMask;
+		u32 fixedAttributes;
 		u32 vertexDataSize;
 
 		std::array<AttributeInfo, maxAttribCount> attributeInfo;
+		std::array<Loader, maxLoaderCount> loaders;
 
 		bool canBeAccelerated;
 		bool indexed;
diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index 84096fb74..a65fd1b54 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -1,5 +1,6 @@
 #include "PICA/draw_acceleration.hpp"
 
+#include <bit>
 #include <limits>
 
 #include "PICA/gpu.hpp"
@@ -53,88 +54,94 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	const u64 vertexCfg = u64(regs[PICA::InternalRegs::AttribFormatLow]) | (u64(regs[PICA::InternalRegs::AttribFormatHigh]) << 32);
 	const u64 inputAttrCfg = getVertexShaderInputConfig();
 
-	u32 buffer = 0;
 	u32 attrCount = 0;
+	u32 loaderOffset = 0;
 	accel.vertexDataSize = 0;
+	accel.totalLoaderCount = 0;
 
-	while (attrCount < totalAttribCount) {
-		bool fixedAttrib = (fixedAttribMask & (1 << attrCount)) != 0;
+	for (int i = 0; i < PICA::DrawAcceleration::maxLoaderCount; i++) {
+		auto& loaderData = attributeInfo[i];  // Get information for this attribute loader
 
-		// Variable attribute attribute
-		if (!fixedAttrib) {
-			auto& attrData = attributeInfo[buffer];  // Get information for this attribute
-			u64 attrCfg = attrData.getConfigFull();  // Get config1 | (config2 << 32)
+		// This loader is empty, skip it
+		if (loaderData.componentCount == 0 || loaderData.size == 0) {
+			continue;
+		}
 
-			if (attrData.componentCount != 0) {
-				// Size of the attribute in bytes multiplied by the total number of vertices
-				const u32 bytes = attrData.size * vertexCount;
-				// Add it to the total vertex data size, aligned to 4 bytes.
-				accel.vertexDataSize += (bytes + 3) & ~3;
+		auto& loader = accel.loaders[accel.totalLoaderCount++];
+
+		// The size of the loader in bytes is equal to the bytes supplied for 1 vertex, multiplied by the number of vertices we'll be uploading
+		// Which is equal to maximumIndex - minimumIndex + 1
+		const u32 bytes = loaderData.size * (accel.maximumIndex - accel.minimumIndex + 1);
+		loader.size = bytes;
+
+		// Add it to the total vertex data size, aligned to 4 bytes.
+		accel.vertexDataSize += (bytes + 3) & ~3;
+		
+		// Get a pointer to the data where this loader's data is stored
+		const u32 loaderAddress = vertexBase + loaderData.offset + (accel.minimumIndex * loaderData.size);
+		loader.data = getPointerPhys<u8>(loaderAddress);
+
+		u64 attrCfg = loaderData.getConfigFull();  // Get config1 | (config2 << 32)
+		u32 attributeOffset = 0;
+
+		for (int component = 0; component < loaderData.componentCount; component++) {
+			uint attributeIndex = (attrCfg >> (component * 4)) & 0xf;  // Get index of attribute in vertexCfg
+
+			// Vertex attributes used as padding
+			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
+			if (attributeIndex >= 12) [[unlikely]] {
+				Helpers::panic("Padding attribute");
+				// Align attribute address up to a 4 byte boundary
+				attributeOffset = (attributeOffset + 3) & -4;
+				attributeOffset += (attributeIndex - 11) << 2;
+				continue;
 			}
 
-			u32 attributeOffset = 0;
-			for (int i = 0; i < attrData.componentCount; i++) {
-				uint index = (attrCfg >> (i * 4)) & 0xf;  // Get index of attribute in vertexCfg
-				auto& attr = accel.attributeInfo[attrCount];
-				attr.fixed = false;
-
-				// Vertex attributes used as padding
-				// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
-				if (index >= 12) [[unlikely]] {
-					Helpers::panic("Padding attribute");
-					// Align attribute address up to a 4 byte boundary
-					attributeOffset = (attributeOffset + 3) & -4;
-					attributeOffset += (index - 11) << 2;
-
-					attr.data = nullptr;
-					attr.isPadding = true;
-					continue;
-				}
-
-				const u32 attribInfo = (vertexCfg >> (index * 4)) & 0xf;
-				const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
-				const u32 size = (attribInfo >> 2) + 1;   // Total number of components
-			
-				// Size of each component based on the attribute type
-				static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
-				const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
-				// Mark the attribute as enabled
-				accel.enabledAttributeMask |= 1 << inputReg;
-
-				// Get a pointer to the data where this attribute is stored
-				const u32 attrAddress = vertexBase + attributeOffset + attrData.offset + (accel.minimumIndex * attrData.size);
-
-				attr.data = getPointerPhys<u8>(attrAddress);
-				attr.inputReg = inputReg;
-				attr.componentCount = size;
-				attr.offset = attributeOffset;
-				attr.size = size * sizePerComponent[attribType];
-				attr.stride = attrData.size;
-				attr.type = attribType;
-				attr.isPadding = false;
-				attributeOffset += attr.size;
-
-				attrCount += 1;
-			}
+			const u32 attribInfo = (vertexCfg >> (attributeIndex * 4)) & 0xf;
+			const u32 attribType = attribInfo & 0x3;  //  Type of attribute (sbyte/ubyte/short/float)
+			const u32 size = (attribInfo >> 2) + 1;   // Total number of components
+
+			// Size of each component based on the attribute type
+			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
+			// Mark the attribute as enabled
+			accel.enabledAttributeMask |= 1 << inputReg;
+
+			auto& attr = accel.attributeInfo[inputReg];
+			attr.componentCount = size;
+			attr.offset = attributeOffset + loaderOffset;
+			attr.stride = loaderData.size;
+			attr.type = attribType;
+			attributeOffset += size * sizePerComponent[attribType];
+		}
 
-			buffer += 1;
-		} else {
-			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[attrCount];
-			auto& attr = accel.attributeInfo[attrCount];
+		loaderOffset += loader.size;
+	}
+
+	u32 fixedAttributes = fixedAttribMask;
+	accel.fixedAttributes = 0;
 
-			attr.fixed = true;
-			// Set the data pointer to nullptr in order to catch any potential bugs
-			attr.data = nullptr;
-			attr.isPadding = false;
+	// Fetch values for all fixed attributes using CTZ (count trailing zeros) on the fixed attribute mask to find the attributes that are actually fixed
+	while (fixedAttributes != 0) {
+		// Get index of next fixed attribute and turn it off
+		const u32 index = std::countr_zero<u32>(fixedAttributes);
+		const u32 mask = 1u << index;
+		fixedAttributes ^= mask;
+
+		// PICA register this fixed attribute is meant to go to
+		const u32 inputReg = (inputAttrCfg >> (index * 4)) & 0xf;
+		const u32 inputRegMask = 1u << inputReg;
+
+		// If this input reg is already used for a non-fixed attribute then it will not be replaced by a fixed attribute
+		if ((accel.enabledAttributeMask & inputRegMask) == 0) {
+			vec4f& fixedAttr = shaderUnit.vs.fixedAttributes[index];
+			auto& attr = accel.attributeInfo[inputReg];
+
+			accel.fixedAttributes |= inputRegMask;
 
 			for (int i = 0; i < 4; i++) {
 				attr.fixedValue[i] = fixedAttr[i].toFloat32();
 			}
-
-			const u32 inputReg = (inputAttrCfg >> (attrCount * 4)) & 0xf;
-
-			attr.inputReg = inputReg;
-			attrCount += 1;
 		}
 	}
 
diff --git a/src/core/PICA/gpu.cpp b/src/core/PICA/gpu.cpp
index 2797e09fb..2624903fc 100644
--- a/src/core/PICA/gpu.cpp
+++ b/src/core/PICA/gpu.cpp
@@ -337,8 +337,6 @@ void GPU::drawArrays() {
 					}
 
 					// Fill the remaining attribute lanes with default parameters (1.0 for alpha/w, 0.0) for everything else
-					// Corgi does this although I'm not sure if it's actually needed for anything.
-					// TODO: Find out
 					while (component < 4) {
 						attribute[component] = (component == 3) ? f24::fromFloat32(1.0) : f24::fromFloat32(0.0);
 						component++;
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 6447f7635..954c30bc9 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -508,7 +508,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
 		if (performIndexedRender) {
-			// When doing indexed rendering, bind the EBO and use glDrawRangeElementsBaseVertex to issue the indexed draw
+			// When doing indexed rendering, use glDrawRangeElementsBaseVertex to issue the indexed draw
 			hwIndexBuffer->Bind();
 			glDrawRangeElementsBaseVertex(
 				primitiveTopology, minimumIndex, maximumIndex, GLsizei(vertices.size()), usingShortIndices ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
@@ -1165,12 +1165,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	hwVertexBuffer->Bind();
 	auto vertexBufferRes = hwVertexBuffer->Map(4, accel->vertexDataSize);
 	u8* vertexData = static_cast<u8*>(vertexBufferRes.pointer);
+	const u32 vertexBufferOffset = vertexBufferRes.buffer_offset;
 
 	gl.bindVAO(hwShaderVAO);
 
 	// Enable or disable vertex attributes as needed
 	const u32 currentAttributeMask = accel->enabledAttributeMask;
-	// Use bitwise xor to calculate which attributes chanced
+	// Use bitwise xor to calculate which attributes changed
 	u32 attributeMaskDiff = currentAttributeMask ^ previousAttributeMask;
 	
 	while (attributeMaskDiff != 0) {
@@ -1190,29 +1191,30 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 
 	previousAttributeMask = currentAttributeMask;
 
-	for (int i = 0; i < totalAttribCount; i++) {
-		const auto& attrib = accel->attributeInfo[i];
+	// Upload the data for each (enabled) attribute loader into our vertex buffer
+	for (int i = 0; i < accel->totalLoaderCount; i++) {
+		auto& loader = accel->loaders[i];
 
-		if (attrib.fixed) {
-			if ((currentAttributeMask & (1u << i)) == 0) {
-				glVertexAttrib4f(attrib.inputReg, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
-			}
-		} else {
-			if (attrib.isPadding) [[unlikely]] {
-				continue;
-			}
-	
-			const u32 attributeSize = attrib.size * vertexCount;
-			std::memcpy(vertexData, attrib.data, attributeSize);
+		std::memcpy(vertexData, loader.data, loader.size);
+		vertexData += loader.size;
+	}
 
-			vertexData += attributeSize;
+	hwVertexBuffer->Unmap(accel->vertexDataSize);
+
+	// Iterate over the 16 PICA input registers and configure how they should be fetched.
+	for (int i = 0; i < 16; i++) {
+		const auto& attrib = accel->attributeInfo[i];
+		const u32 attributeMask = 1u << i;
 
+		if (accel->fixedAttributes & attributeMask) {
+			// This is a fixed attribute, so set its fixed value
+			// TODO: Don't update these if the value does not change, it generates way too many calls
+			glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+		} else if (accel->enabledAttributeMask & attributeMask) {
 			glVertexAttribPointer(
-				attrib.inputReg, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
-				reinterpret_cast<GLvoid*>(vertexBufferRes.buffer_offset + attrib.offset)
+				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,
+				reinterpret_cast<GLvoid*>(vertexBufferOffset + attrib.offset)
 			);
 		}
 	}
-
-	hwVertexBuffer->Unmap(accel->vertexDataSize);
 }
\ No newline at end of file
diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
index ff6c79f9b..6fff8b95e 100644
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -149,7 +149,7 @@ namespace {
 			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 			for (; m_available_block_index < end; m_available_block_index++) {
 				if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
-					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on in not in use");
+					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
 				}
 
 				WaitForSync(m_sync_objects[m_available_block_index]);

From 16425379e3c52d7e71be5c3df22b154c8f223153 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 4 Oct 2024 19:14:55 +0300
Subject: [PATCH 49/63] Stream buffer: Fix copy-paste mistake

---
 third_party/duckstation/gl/stream_buffer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/duckstation/gl/stream_buffer.cpp b/third_party/duckstation/gl/stream_buffer.cpp
index 6fff8b95e..b7a406036 100644
--- a/third_party/duckstation/gl/stream_buffer.cpp
+++ b/third_party/duckstation/gl/stream_buffer.cpp
@@ -148,7 +148,7 @@ namespace {
 		ALWAYS_INLINE void EnsureSyncsWaitedForOffset(u32 offset) {
 			const u32 end = std::min<u32>(GetSyncIndexForOffset(offset) + 1, NUM_SYNC_POINTS);
 			for (; m_available_block_index < end; m_available_block_index++) {
-				if (!m_sync_objects[m_used_block_index]) [[unlikely]] {
+				if (!m_sync_objects[m_available_block_index]) [[unlikely]] {
 					Helpers::warn("GL stream buffer: Fence slot we're trying to wait on is not in use");
 				}
 

From 09b04704f82a3ce483763cde94ef95fd91979834 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 4 Oct 2024 19:38:43 +0300
Subject: [PATCH 50/63] HW shaders: Fix indexed rendering

---
 src/core/renderer_gl/renderer_gl.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 954c30bc9..80d2ab415 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -1067,7 +1067,7 @@ void RendererGL::screenshot(const std::string& name) {
 
 	// Flip the image vertically
 	for (int y = 0; y < height; y++) {
-		memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
+		std::memcpy(&flippedPixels[y * width * 4], &pixels[(height - y - 1) * width * 4], width * 4);
 		// Swap R and B channels
 		for (int x = 0; x < width; x++) {
 			std::swap(flippedPixels[y * width * 4 + x * 4 + 0], flippedPixels[y * width * 4 + x * 4 + 2]);
@@ -1152,7 +1152,7 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 	// Update index buffer if necessary
 	if (accel->indexed) {
 		usingShortIndices = accel->useShortIndices;
-		const usize indexBufferSize = usize(vertexCount) * (usingShortIndices ? sizeof(u16) : sizeof(u8));
+		const usize indexBufferSize = regs[PICA::InternalRegs::VertexCountReg] * (usingShortIndices ? sizeof(u16) : sizeof(u8));
 
 		hwIndexBuffer->Bind();
 		auto indexBufferRes = hwIndexBuffer->Map(4, indexBufferSize);

From 0a2bc7c909f1a392253dad241dab86c196acbe40 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Fri, 4 Oct 2024 19:48:47 +0300
Subject: [PATCH 51/63] HW shaders: Add padding attributes

---
 src/core/PICA/draw_acceleration.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
index a65fd1b54..1850d8190 100644
--- a/src/core/PICA/draw_acceleration.cpp
+++ b/src/core/PICA/draw_acceleration.cpp
@@ -90,7 +90,6 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			// Vertex attributes used as padding
 			// 12, 13, 14 and 15 are equivalent to 4, 8, 12 and 16 bytes of padding respectively
 			if (attributeIndex >= 12) [[unlikely]] {
-				Helpers::panic("Padding attribute");
 				// Align attribute address up to a 4 byte boundary
 				attributeOffset = (attributeOffset + 3) & -4;
 				attributeOffset += (attributeIndex - 11) << 2;

From e3252ec4ef28c9563fd0fb735a63320dc3f1838b Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sat, 5 Oct 2024 21:14:40 +0300
Subject: [PATCH 52/63] HW shaders: Avoid redundant glVertexAttrib4f calls

---
 include/renderer_gl/renderer_gl.hpp  |  3 +++
 src/core/renderer_gl/renderer_gl.cpp | 22 ++++++++++++++++------
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/renderer_gl/renderer_gl.hpp b/include/renderer_gl/renderer_gl.hpp
index 137c48898..738ce15a1 100644
--- a/include/renderer_gl/renderer_gl.hpp
+++ b/include/renderer_gl/renderer_gl.hpp
@@ -99,6 +99,9 @@ class RendererGL final : public Renderer {
 	std::unique_ptr<StreamBuffer> hwVertexBuffer;
 	std::unique_ptr<StreamBuffer> hwIndexBuffer;
 
+	// Cache of fixed attribute values so that we don't do any duplicate updates
+	std::array<std::array<float, 4>, 16> fixedAttrValues;
+
 	// Cached recompiled fragment shader
 	struct CachedProgram {
 		OpenGL::Program program;
diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 80d2ab415..7e68e0c9a 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -184,6 +184,12 @@ void RendererGL::initGraphicsContextInternal() {
 	OpenGL::clearColor();
 	OpenGL::setViewport(oldViewport[0], oldViewport[1], oldViewport[2], oldViewport[3]);
 
+	// Initialize fixed attributes
+	for (int i = 0; i < fixedAttrValues.size(); i++) {
+		fixedAttrValues[i] = {0.f, 0.f, 0.f, 0.f};
+		glVertexAttrib4f(i, 0.0, 0.0, 0.0, 0.0);
+	}
+
 	reset();
 
 	// Initialize the default vertex shader used with shadergen
@@ -1008,12 +1014,12 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 				glBufferSubData(GL_UNIFORM_BUFFER, 0, PICAShader::totalUniformSize(), shaderUnit.vs.getUniformPointer());
 			}
 
-			// Upload vertex data and index buffer data to our GPU
-			accelerateVertexUpload(shaderUnit, accel);
-
 			performIndexedRender = accel->indexed;
 			minimumIndex = GLsizei(accel->minimumIndex);
 			maximumIndex = GLsizei(accel->maximumIndex);
+
+			// Upload vertex data and index buffer data to our GPU
+			accelerateVertexUpload(shaderUnit, accel);
 		}
 	}
 
@@ -1207,9 +1213,13 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 		const u32 attributeMask = 1u << i;
 
 		if (accel->fixedAttributes & attributeMask) {
-			// This is a fixed attribute, so set its fixed value
-			// TODO: Don't update these if the value does not change, it generates way too many calls
-			glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			auto& attrValue = fixedAttrValues[i];
+			// This is a fixed attribute, so set its fixed value, but only if it actually needs to be updated
+			if (attrValue[0] != attrib.fixedValue[0] || attrValue[1] != attrib.fixedValue[1] || attrValue[2] != attrib.fixedValue[2] ||
+				attrValue[3] != attrib.fixedValue[3]) {
+				std::memcpy(attrValue.data(), attrib.fixedValue.data(), sizeof(attrib.fixedValue));
+				glVertexAttrib4f(i, attrib.fixedValue[0], attrib.fixedValue[1], attrib.fixedValue[2], attrib.fixedValue[3]);
+			}
 		} else if (accel->enabledAttributeMask & attributeMask) {
 			glVertexAttribPointer(
 				i, attrib.componentCount, attributeFormats[attrib.type], GL_FALSE, attrib.stride,

From 872a6baccac2536a7eb3ef57f14cc5ea37312bac Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 15:39:13 +0300
Subject: [PATCH 53/63] HW shaders: Fix loops

---
 src/core/PICA/shader_decompiler.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index ead984100..aaa38a466 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -694,10 +694,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const u32 uniformIndex = getBits<22, 2>(instruction);
 
 				// loop counter = uniform.y
-				decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 16u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 8u) & 0xFFu);\n", uniformIndex);
 				decompiledShader += fmt::format(
-					"for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 24) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> "
-					"8u) & 0xFFu)) {{\n",
+					"for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 0) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> "
+					"16u) & 0xFFu)) {{\n",
 					pc, pc, uniformIndex, pc, uniformIndex
 				);
 
@@ -706,6 +706,10 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				callFunction(*func);
 				decompiledShader += "}\n";
 
+				// Jump to the end of the loop. We don't want to compile the code inside the loop again.
+				// pc will be incremented by 1 due to the pc++ at the end of this loop.
+				pc = dest;
+
 				if (func->exitMode == ExitMode::AlwaysEnd) {
 					finished = true;
 					return;

From bb7b1b3ef19def8fd00569d74c7058865ee4a42e Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 16:07:57 +0300
Subject: [PATCH 54/63] HW shaders: Make generated shaders slightly smaller

---
 src/core/PICA/shader_decompiler.cpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index aaa38a466..13a05f161 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -222,21 +222,21 @@ void ShaderDecompiler::writeAttributes() {
 	decompiledShader += R"(
 	layout(location = 0) in vec4 inputs[16];
 	layout(std140) uniform PICAShaderUniforms {
-		vec4 uniform_float[96];
-		uvec4 uniform_int;
+		vec4 uniform_f[96];
+		uvec4 uniform_i;
 		uint uniform_bool;
 	};
 
-	vec4 tmp_regs[16];
+	vec4 temp[16];
 	vec4 out_regs[16];
 	vec4 dummy_vec = vec4(0.0);
 	ivec3 addr_reg = ivec3(0);
 	bvec2 cmp_reg = bvec2(false);
 
-	vec4 float_uniform_indexed(int source, int offset) {
+	vec4 uniform_indexed(int source, int offset) {
 		int clipped_offs = (offset >= -128 && offset <= 127) ? offset : 0;
 		uint index = uint(clipped_offs + source) & 127u;
-		return (index < 96u) ? uniform_float[index] : vec4(1.0);
+		return (index < 96u) ? uniform_f[index] : vec4(1.0);
 	}
 )";
 }
@@ -340,7 +340,7 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index)
 	if (source < 0x10) {
 		return "inputs[" + std::to_string(source) + "]";
 	} else if (source < 0x20) {
-		return "tmp_regs[" + std::to_string(source - 0x10) + "]";
+		return "temp[" + std::to_string(source - 0x10) + "]";
 	} else {
 		const usize floatIndex = (source - 0x20) & 0x7f;
 
@@ -348,10 +348,10 @@ std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index)
 			if (floatIndex >= 96) [[unlikely]] {
 				return "dummy_vec";
 			}
-			return "uniform_float[" + std::to_string(floatIndex) + "]";
+			return "uniform_f[" + std::to_string(floatIndex) + "]";
 		} else {
 			static constexpr std::array<const char*, 4> offsets = {"0", "addr_reg.x", "addr_reg.y", "addr_reg.z"};
-			return fmt::format("float_uniform_indexed({}, {})", floatIndex, offsets[index]);
+			return fmt::format("uniform_indexed({}, {})", floatIndex, offsets[index]);
 		}
 	}
 }
@@ -360,7 +360,7 @@ std::string ShaderDecompiler::getDest(u32 dest) const {
 	if (dest < 0x10) {
 		return "out_regs[" + std::to_string(dest) + "]";
 	} else if (dest < 0x20) {
-		return "tmp_regs[" + std::to_string(dest - 0x10) + "]";
+		return "temp[" + std::to_string(dest - 0x10) + "]";
 	} else {
 		return "dummy_vec";
 	}
@@ -694,9 +694,9 @@ void ShaderDecompiler::compileInstruction(u32& pc, bool& finished) {
 				const u32 uniformIndex = getBits<22, 2>(instruction);
 
 				// loop counter = uniform.y
-				decompiledShader += fmt::format("addr_reg.z = int((uniform_int[{}] >> 8u) & 0xFFu);\n", uniformIndex);
+				decompiledShader += fmt::format("addr_reg.z = int((uniform_i[{}] >> 8u) & 0xFFu);\n", uniformIndex);
 				decompiledShader += fmt::format(
-					"for (uint loopCtr{} = 0u; loopCtr{} <= ((uniform_int[{}] >> 0) & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_int[{}] >> "
+					"for (uint loopCtr{} = 0u; loopCtr{} <= (uniform_i[{}] & 0xFFu); loopCtr{}++, addr_reg.z += int((uniform_i[{}] >> "
 					"16u) & 0xFFu)) {{\n",
 					pc, pc, uniformIndex, pc, uniformIndex
 				);

From 53097cc53ec5e2ad5d0f3b5c40a5cf04da9789f9 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 17:01:27 +0300
Subject: [PATCH 55/63] Fix libretro build

---
 src/libretro_core.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libretro_core.cpp b/src/libretro_core.cpp
index d77ee7260..21a62f230 100644
--- a/src/libretro_core.cpp
+++ b/src/libretro_core.cpp
@@ -198,7 +198,7 @@ static void configUpdate() {
 	config.sdWriteProtected = fetchVariableBool("panda3ds_write_protect_virtual_sd", false);
 	config.accurateShaderMul = fetchVariableBool("panda3ds_accurate_shader_mul", false);
 	config.useUbershaders = fetchVariableBool("panda3ds_use_ubershader", EmulatorConfig::ubershaderDefault);
-	config.accelerateShaders = FetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
+	config.accelerateShaders = fetchVariableBool("panda3ds_accelerate_shaders", EmulatorConfig::accelerateShadersDefault);
 
 	config.forceShadergenForLights = fetchVariableBool("panda3ds_ubershader_lighting_override", true);
 	config.lightShadergenThreshold = fetchVariableRange("panda3ds_ubershader_lighting_override_threshold", 1, 8);

From b833e071d153f8f03f10aaa4b886326ba10df8cc Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 17:13:51 +0300
Subject: [PATCH 56/63] Update config.hpp

---
 include/config.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/config.hpp b/include/config.hpp
index da5b69408..a8ba89466 100644
--- a/include/config.hpp
+++ b/include/config.hpp
@@ -20,7 +20,7 @@ struct EmulatorConfig {
 #else
 	static constexpr bool ubershaderDefault = true;
 #endif
-	static constexpr bool accelerateShadersDefault = false;
+	static constexpr bool accelerateShadersDefault = true;
 	
 	bool shaderJitEnabled = shaderJitDefault;
 	bool useUbershaders = ubershaderDefault;

From 12d081096a5cafbaabe31fd5b9c44faa983e6045 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 17:48:34 +0300
Subject: [PATCH 57/63] Update renderer_gl.cpp

---
 src/core/renderer_gl/renderer_gl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index fa0df0f16..641785272 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -510,6 +510,7 @@ void RendererGL::drawVertices(PICA::PrimType primType, std::span<const Vertex> v
 	setupStencilTest(stencilEnable);
 
 	if (!usingAcceleratedShader) {
+		return;
 		vbo.bufferVertsSub(vertices);
 		OpenGL::draw(primitiveTopology, GLsizei(vertices.size()));
 	} else {
@@ -1227,4 +1228,4 @@ void RendererGL::accelerateVertexUpload(ShaderUnit& shaderUnit, PICA::DrawAccele
 			);
 		}
 	}
-}
\ No newline at end of file
+}

From 56c3e738adaa8f2d1f803956176b020f9e2ef4dc Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 18:54:03 +0300
Subject: [PATCH 58/63] Add Android logging when a shader fails to compile

---
 third_party/opengl/opengl.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp
index 607815fa0..b17ec8e3b 100644
--- a/third_party/opengl/opengl.hpp
+++ b/third_party/opengl/opengl.hpp
@@ -30,6 +30,7 @@
 #include <type_traits>
 #include <utility>
 
+#include <android/log.h>
 #include <glad/gl.h>
 
 // Check if we have C++20. If yes, we can add C++20 std::span support
@@ -383,7 +384,7 @@ namespace OpenGL {
 			if (success == GL_FALSE) {
 				char buf[4096];
 				glGetShaderInfoLog(m_handle, 4096, nullptr, buf);
-				fprintf(stderr, "Failed to compile shader\nError: %s\n", buf);
+				__android_log_print("Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]);
 				glDeleteShader(m_handle);
 
 				m_handle = 0;

From 40a7ac6d29ac639ee8ca443603f41926550aee9d Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 19:03:32 +0300
Subject: [PATCH 59/63] Update opengl.hpp

---
 third_party/opengl/opengl.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp
index b17ec8e3b..5ed630b2d 100644
--- a/third_party/opengl/opengl.hpp
+++ b/third_party/opengl/opengl.hpp
@@ -384,7 +384,7 @@ namespace OpenGL {
 			if (success == GL_FALSE) {
 				char buf[4096];
 				glGetShaderInfoLog(m_handle, 4096, nullptr, buf);
-				__android_log_print("Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]);
+				__android_log_print(ANDROID_LOG_INFO, "AlberDriver", "Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]);
 				glDeleteShader(m_handle);
 
 				m_handle = 0;

From 5202d9172e48ab585e71978fc58614e2037c927d Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 19:23:02 +0300
Subject: [PATCH 60/63] Shader Decompiler: Add int/float precision qualifiers.

---
 src/core/PICA/shader_decompiler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index 13a05f161..be05dd44b 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -253,7 +253,7 @@ std::string ShaderDecompiler::decompile() {
 
 	switch (api) {
 		case API::GL: decompiledShader += "#version 410 core\n"; break;
-		case API::GLES: decompiledShader += "#version 300 es\n"; break;
+		case API::GLES: decompiledShader += "#version 300 es\nprecision mediump float;\nprecision mediump int;\n"; break;
 		default: break;
 	}
 

From 214c1d8bed9b6b0b47f7c3c01ebfce0740ac9734 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 19:52:56 +0300
Subject: [PATCH 61/63] Update renderer_gl.cpp

---
 src/core/renderer_gl/renderer_gl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/core/renderer_gl/renderer_gl.cpp b/src/core/renderer_gl/renderer_gl.cpp
index 641785272..c4ce4227c 100644
--- a/src/core/renderer_gl/renderer_gl.cpp
+++ b/src/core/renderer_gl/renderer_gl.cpp
@@ -992,7 +992,8 @@ bool RendererGL::prepareForDraw(ShaderUnit& shaderUnit, PICA::DrawAcceleration*
 			shader = OpenGL::Shader();
 
 			std::string picaShaderSource = PICA::ShaderGen::decompileShader(
-				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint, PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
+				shaderUnit.vs, *emulatorConfig, shaderUnit.vs.entrypoint,
+				Helpers::isAndroid() ? PICA::ShaderGen::API::GLES : PICA::ShaderGen::API::GL, PICA::ShaderGen::Language::GLSL
 			);
 
 			// Empty source means compilation error, if the source is not empty then we convert the recompiled PICA code into a valid shader and upload

From 0252a2a996bf9b210dd69e8098c6bd22780610ab Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Sun, 6 Oct 2024 20:34:01 +0300
Subject: [PATCH 62/63] Update shader_decompiler.cpp

---
 src/core/PICA/shader_decompiler.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/core/PICA/shader_decompiler.cpp b/src/core/PICA/shader_decompiler.cpp
index be05dd44b..021a03a38 100644
--- a/src/core/PICA/shader_decompiler.cpp
+++ b/src/core/PICA/shader_decompiler.cpp
@@ -220,7 +220,23 @@ const Function* ShaderDecompiler::findFunction(const AddressRange& range) {
 
 void ShaderDecompiler::writeAttributes() {
 	decompiledShader += R"(
-	layout(location = 0) in vec4 inputs[16];
+	layout(location = 0) in vec4 input0;
+	layout(location = 1) in vec4 input1;
+	layout(location = 2) in vec4 input2;
+	layout(location = 3) in vec4 input3;
+	layout(location = 4) in vec4 input4;
+	layout(location = 5) in vec4 input5;
+	layout(location = 6) in vec4 input6;
+	layout(location = 7) in vec4 input7;
+	layout(location = 8) in vec4 input8;
+	layout(location = 9) in vec4 input9;
+	layout(location = 10) in vec4 input10;
+	layout(location = 11) in vec4 input11;
+	layout(location = 12) in vec4 input12;
+	layout(location = 13) in vec4 input13;
+	layout(location = 14) in vec4 input14;
+	layout(location = 15) in vec4 input15;
+
 	layout(std140) uniform PICAShaderUniforms {
 		vec4 uniform_f[96];
 		uvec4 uniform_i;
@@ -338,7 +354,7 @@ std::string ShaderDecompiler::decompile() {
 
 std::string ShaderDecompiler::getSource(u32 source, [[maybe_unused]] u32 index) const {
 	if (source < 0x10) {
-		return "inputs[" + std::to_string(source) + "]";
+		return "input" + std::to_string(source);
 	} else if (source < 0x20) {
 		return "temp[" + std::to_string(source - 0x10) + "]";
 	} else {

From aa181292fc59e034ff79561c578c5ccaa6fac769 Mon Sep 17 00:00:00 2001
From: wheremyfoodat <44909372+wheremyfoodat@users.noreply.github.com>
Date: Wed, 9 Oct 2024 01:35:29 +0300
Subject: [PATCH 63/63] Update opengl.hpp

---
 third_party/opengl/opengl.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/third_party/opengl/opengl.hpp b/third_party/opengl/opengl.hpp
index 5ed630b2d..01b93373e 100644
--- a/third_party/opengl/opengl.hpp
+++ b/third_party/opengl/opengl.hpp
@@ -30,7 +30,6 @@
 #include <type_traits>
 #include <utility>
 
-#include <android/log.h>
 #include <glad/gl.h>
 
 // Check if we have C++20. If yes, we can add C++20 std::span support
@@ -384,7 +383,6 @@ namespace OpenGL {
 			if (success == GL_FALSE) {
 				char buf[4096];
 				glGetShaderInfoLog(m_handle, 4096, nullptr, buf);
-				__android_log_print(ANDROID_LOG_INFO, "AlberDriver", "Failed to compile shader\nError: %s\nShader: %s", buf, sources[0]);
 				glDeleteShader(m_handle);
 
 				m_handle = 0;