From a320d1a5b4b7ce3b90372697fbe50242b78d082e Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 00:56:32 -0200 Subject: Clipper: Avoid dynamic allocations The triangle clipper was allocating its temporary input, output and work buffers using a std::vector. Since this is a hot path, it's desirable to use stack allocation instead. --- src/video_core/clipper.cpp | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'src/video_core/clipper.cpp') diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 0bcd0b895..e89b7a0c0 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <vector> +#include <boost/container/static_vector.hpp> #include "clipper.h" #include "pica.h" @@ -98,18 +98,15 @@ static void InitScreenCoordinates(OutputVertex& vtx) } void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { + using boost::container::static_vector; // TODO (neobrain): // The list of output vertices has some fixed maximum size, // however I haven't taken the time to figure out what it is exactly. - // For now, we hence just assume a maximal size of 1000 vertices. - const size_t max_vertices = 1000; - std::vector<OutputVertex> buffer_vertices; - std::vector<OutputVertex*> output_list{ &v0, &v1, &v2 }; - - // Make sure to reserve space for all vertices. - // Without this, buffer reallocation would invalidate references. - buffer_vertices.reserve(max_vertices); + // For now, we hence just assume a maximal size of 256 vertices. + static const size_t MAX_VERTICES = 256; + static_vector<OutputVertex, MAX_VERTICES> buffer_vertices; + static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 }; // Simple implementation of the Sutherland-Hodgman clipping algorithm. 
// TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) @@ -120,7 +117,7 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { - const std::vector<OutputVertex*> input_list = output_list; + const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list; output_list.clear(); const OutputVertex* reference_vertex = input_list.back(); -- cgit v1.2.3 From 7e9bc85cc826c55a5aa612a3c2f14b8fb631a68c Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Sun, 28 Dec 2014 23:05:15 -0200 Subject: Clipper: Compact buffers on each clipping pass Use a new buffer management scheme in the clipper that allows using a bounded minimal amount of buffer space. Even though it copies more data it is still slightly faster likely due to using less cache. --- src/video_core/clipper.cpp | 55 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 28 deletions(-) (limited to 'src/video_core/clipper.cpp') diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index e89b7a0c0..0521ef866 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -100,13 +100,15 @@ static void InitScreenCoordinates(OutputVertex& vtx) void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { using boost::container::static_vector; - // TODO (neobrain): - // The list of output vertices has some fixed maximum size, - // however I haven't taken the time to figure out what it is exactly. - // For now, we hence just assume a maximal size of 256 vertices. - static const size_t MAX_VERTICES = 256; - static_vector<OutputVertex, MAX_VERTICES> buffer_vertices; - static_vector<OutputVertex*, MAX_VERTICES> output_list = { &v0, &v1, &v2 }; + // Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at + // the new edge (or less in degenerate cases). 
As such, we can say that each clipping plane + // introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a + // fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. + static const size_t MAX_VERTICES = 9; + static_vector<OutputVertex, MAX_VERTICES> buffer_a = { v0, v1, v2 }; + static_vector<OutputVertex, MAX_VERTICES> buffer_b; + auto* output_list = &buffer_a; + auto* input_list = &buffer_b; // Simple implementation of the Sutherland-Hodgman clipping algorithm. // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) @@ -117,48 +119,45 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { ClippingEdge(ClippingEdge::POS_Z, float24::FromFloat32(+1.0)), ClippingEdge(ClippingEdge::NEG_Z, float24::FromFloat32(-1.0)) }) { - const static_vector<OutputVertex*, MAX_VERTICES> input_list = output_list; - output_list.clear(); + std::swap(input_list, output_list); + output_list->clear(); - const OutputVertex* reference_vertex = input_list.back(); + const OutputVertex* reference_vertex = &input_list->back(); - for (const auto& vertex : input_list) { + for (const auto& vertex : *input_list) { // NOTE: This algorithm changes vertex order in some cases! - if (edge.IsInside(*vertex)) { + if (edge.IsInside(vertex)) { if (edge.IsOutSide(*reference_vertex)) { - buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); - output_list.push_back(&(buffer_vertices.back())); + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); } - output_list.push_back(vertex); + output_list->push_back(vertex); } else if (edge.IsInside(*reference_vertex)) { - buffer_vertices.push_back(edge.GetIntersection(*vertex, *reference_vertex)); - output_list.push_back(&(buffer_vertices.back())); + output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); } - - reference_vertex = vertex; + reference_vertex = &vertex; } // Need to have at least a full triangle to continue... 
- if (output_list.size() < 3) + if (output_list->size() < 3) return; } - InitScreenCoordinates(*(output_list[0])); - InitScreenCoordinates(*(output_list[1])); + InitScreenCoordinates((*output_list)[0]); + InitScreenCoordinates((*output_list)[1]); - for (size_t i = 0; i < output_list.size() - 2; i ++) { - OutputVertex& vtx0 = *(output_list[0]); - OutputVertex& vtx1 = *(output_list[i+1]); - OutputVertex& vtx2 = *(output_list[i+2]); + for (size_t i = 0; i < output_list->size() - 2; i ++) { + OutputVertex& vtx0 = (*output_list)[0]; + OutputVertex& vtx1 = (*output_list)[i+1]; + OutputVertex& vtx2 = (*output_list)[i+2]; InitScreenCoordinates(vtx2); LOG_TRACE(Render_Software, - "Triangle %lu/%lu (%lu buffer vertices) at position (%.3f, %.3f, %.3f, %.3f), " + "Triangle %lu/%lu at position (%.3f, %.3f, %.3f, %.3f), " "(%.3f, %.3f, %.3f, %.3f), (%.3f, %.3f, %.3f, %.3f) and " "screen position (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f), (%.2f, %.2f, %.2f)", - i,output_list.size(), buffer_vertices.size(), + i, output_list->size(), vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), vtx2.pos.w.ToFloat32(), -- cgit v1.2.3 From 8369ee58035ca98f776428f6cccbcf987fee3bc9 Mon Sep 17 00:00:00 2001 From: Yuri Kunde Schlesner Date: Tue, 23 Dec 2014 13:05:51 -0200 Subject: Rasterizer: Pre-divide vertex attributes by W Execute the division-by-W for perspective-correct interpolation of values in the clipper, moving them out of the rasterization inner loop. 
--- src/video_core/clipper.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'src/video_core/clipper.cpp') diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp index 0521ef866..1744066ba 100644 --- a/src/video_core/clipper.cpp +++ b/src/video_core/clipper.cpp @@ -91,10 +91,17 @@ static void InitScreenCoordinates(OutputVertex& vtx) viewport.zscale = float24::FromRawFloat24(registers.viewport_depth_range); viewport.offset_z = float24::FromRawFloat24(registers.viewport_depth_far_plane); + float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w; + vtx.color *= inv_w; + vtx.tc0 *= inv_w; + vtx.tc1 *= inv_w; + vtx.tc2 *= inv_w; + vtx.pos.w = inv_w; + // TODO: Not sure why the viewport width needs to be divided by 2 but the viewport height does not - vtx.screenpos[0] = (vtx.pos.x / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; - vtx.screenpos[1] = (vtx.pos.y / vtx.pos.w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; - vtx.screenpos[2] = viewport.offset_z - vtx.pos.z / vtx.pos.w * viewport.zscale; + vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x; + vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y; + vtx.screenpos[2] = viewport.offset_z - vtx.pos.z * inv_w * viewport.zscale; } void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) { -- cgit v1.2.3