From a320d1a5b4b7ce3b90372697fbe50242b78d082e Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 00:56:32 -0200
Subject: Clipper: Avoid dynamic allocations

The triangle clipper was allocating its temporary input, output and work
buffers using a std::vector. Since this is a hot path, it's desirable to
use stack allocation instead.
---
 src/video_core/clipper.cpp | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'src/video_core/clipper.cpp')
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0bcd0b895..e89b7a0c0 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -2,7 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include <vector>
+#include <boost/container/static_vector.hpp>
 
 #include "clipper.h"
 #include "pica.h"
@@ -98,18 +98,15 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
+    using boost::container::static_vector;
 
     // TODO (neobrain):
     // The list of output vertices has some fixed maximum size,
     // however I haven't taken the time to figure out what it is exactly.
-    // For now, we hence just assume a maximal size of 1000 vertices.
-    const size_t max_vertices = 1000;
-    std::vector<OutputVertex> buffer_vertices;
-    std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 };
-
-    // Make sure to reserve space for all vertices.
-    // Without this, buffer reallocation would invalidate references.
-    buffer_vertices.reserve(max_vertices);
+    // For now, we hence just assume a maximal size of 256 vertices.
+    static const size_t MAX_VERTICES = 256;
+    static_vector<OutputVertex, MAX_VERTICES> buffer_vertices;
+    static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 };
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -120,7 +117,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
 
-        const std::vector<OutputVertex*> input_list = output_list;
+        const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list;
         output_list.clear();
 
         const OutputVertex* reference_vertex = input_list.back();
-- 
cgit v1.2.3


From 7e9bc85cc826c55a5aa612a3c2f14b8fb631a68c Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 28 Dec 2014 23:05:15 -0200
Subject: Clipper: Compact buffers on each clipping pass

Use a new buffer management scheme in the clipper that allows using a
minimal, bounded amount of buffer space. Even though it copies more
data, it is still slightly faster, likely due to using less cache.
---
 src/video_core/clipper.cpp | 55 +++++++++++++++++++++++-----------------------
 1 file changed, 27 insertions(+), 28 deletions(-)

(limited to 'src/video_core/clipper.cpp')

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index e89b7a0c0..0521ef866 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -100,13 +100,15 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     using boost::container::static_vector;
 
-    // TODO (neobrain):
-    // The list of output vertices has some fixed maximum size,
-    // however I haven't taken the time to figure out what it is exactly.
-    // For now, we hence just assume a maximal size of 256 vertices.
-    static const size_t MAX_VERTICES = 256;
-    static_vector<OutputVertex, MAX_VERTICES> buffer_vertices;
-    static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 };
+    // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduce 2 at
+    // the new edge (or fewer in degenerate cases). As such, we can say that each clipping plane
+    // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
+    // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
+    static const size_t MAX_VERTICES = 9;
+    static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 };
+    static_vector<OutputVertex, MAX_VERTICES> buffer_b;
+    auto* output_list = &buffer_a;
+    auto* input_list  = &buffer_b;
 
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
@@ -117,48 +119,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                        ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)),
                        ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) {
 
-        const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list;
-        output_list.clear();
+        std::swap(input_list, output_list);
+        output_list->clear();
 
-        const OutputVertex* reference_vertex = input_list.back();
+        const OutputVertex* reference_vertex = &input_list->back();
 
-        for (const auto& vertex : input_list) {
+        for (const auto& vertex : *input_list) {
             // NOTE: This algorithm changes vertex order in some cases!
-            if (edge.IsInside(*vertex)) {
+            if (edge.IsInside(vertex)) {
                 if (edge.IsOutSide(*reference_vertex)) {
-                    buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                    output_list.push_back(&(buffer_vertices.back()));
+                    output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
                 }
 
-                output_list.push_back(vertex);
+                output_list->push_back(vertex);
             } else if (edge.IsInside(*reference_vertex)) {
-                buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex));
-                output_list.push_back(&(buffer_vertices.back()));
+                output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
             }
-
-            reference_vertex = vertex;
+            reference_vertex = &vertex;
         }
 
         // Need to have at least a full triangle to continue...
-        if (output_list.size() < 3)
+        if (output_list->size() < 3)
             return;
     }
 
-    InitScreenCoordinates(*(output_list[0]));
-    InitScreenCoordinates(*(output_list[1]));
+    InitScreenCoordinates((*output_list)[0]);
+    InitScreenCoordinates((*output_list)[1]);
 
-    for (size_t i = 0; i < output_list.size() - 2; i ++) {
-        OutputVertex& vtx0 = *(output_list[0]);
-        OutputVertex& vtx1 = *(output_list[i+1]);
-        OutputVertex& vtx2 = *(output_list[i+2]);
+    for (size_t i = 0; i < output_list->size() - 2; i ++) {
+        OutputVertex& vtx0 = (*output_list)[0];
+        OutputVertex& vtx1 = (*output_list)[i+1];
+        OutputVertex& vtx2 = (*output_list)[i+2];
 
         InitScreenCoordinates(vtx2);
 
         LOG_TRACE(Render_Software,
-                  "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), "
+                  "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), "
                   "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and "
                   "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)",
-                  i,output_list.size(), buffer_vertices.size(),
+                  i, output_list->size(),
                   vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(),
                   vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
                   vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(),
-- 
cgit v1.2.3


From 8369ee58035ca98f776428f6cccbcf987fee3bc9 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Tue, 23 Dec 2014 13:05:51 -0200
Subject: Rasterizer: Pre-divide vertex attributes by W

Execute the division-by-W for perspective-correct interpolation of
values in the clipper, moving it out of the rasterization inner loop.
---
 src/video_core/clipper.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'src/video_core/clipper.cpp')

diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 0521ef866..1744066ba 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -91,10 +91,17 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.zscale     = float24::FromRawFloat24(registers.viewport_depth_range);
     viewport.offset_z   = float24::FromRawFloat24(registers.viewport_depth_far_plane);
 
+    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
+    vtx.color *= inv_w;
+    vtx.tc0 *= inv_w;
+    vtx.tc1 *= inv_w;
+    vtx.tc2 *= inv_w;
+    vtx.pos.w = inv_w;
+
     // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not
-    vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
-    vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale;
+    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
+    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
+    vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale;
 }
 
 void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
-- 
cgit v1.2.3