diff options
| -rw-r--r-- | src/video_core/pica.h | 14 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 10 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 127 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 6 | 
4 files changed, 100 insertions, 57 deletions
diff --git a/src/video_core/pica.h b/src/video_core/pica.h index 58b924f9e..bb689f2a9 100644 --- a/src/video_core/pica.h +++ b/src/video_core/pica.h @@ -1021,12 +1021,20 @@ struct float24 {          return ret;      } +    static float24 Zero() { +        return FromFloat32(0.f); +    } +      // Not recommended for anything but logging      float ToFloat32() const {          return value;      }      float24 operator * (const float24& flt) const { +        if ((this->value == 0.f && !std::isnan(flt.value)) || +            (flt.value == 0.f && !std::isnan(this->value))) +            // PICA gives 0 instead of NaN when multiplying by inf +            return Zero();          return float24::FromFloat32(ToFloat32() * flt.ToFloat32());      } @@ -1043,7 +1051,11 @@ struct float24 {      }      float24& operator *= (const float24& flt) { -        value *= flt.ToFloat32(); +        if ((this->value == 0.f && !std::isnan(flt.value)) || +            (flt.value == 0.f && !std::isnan(this->value))) +            // PICA gives 0 instead of NaN when multiplying by inf +            *this = Zero(); +        else value *= flt.ToFloat32();          return *this;      } diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index ae5a30441..69e4efa68 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {                      if (!swizzle.DestComponentEnabled(i))                          continue; -                    dest[i] = std::max(src1[i], src2[i]); +                    // NOTE: Exact form required to match NaN semantics to hardware: +                    //   max(0, NaN) -> NaN +                    //   max(NaN, 0) -> 0 +                    dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];                  }                  Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);                  break; @@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {                      if (!swizzle.DestComponentEnabled(i))                          continue; -                    dest[i] = std::min(src1[i], src2[i]); +                    // NOTE: Exact form required to match NaN semantics to hardware: +                    //   min(0, NaN) -> NaN +                    //   min(NaN, 0) -> 0 +                    dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];                  }                  Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);                  break; diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index a8045d4b0..d3cfe109e 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;  static const X64Reg SRC2 = XMM2;  /// Loaded with the third swizzled source register, otherwise can be used as a scratch register  static const X64Reg SRC3 = XMM3; +/// Additional scratch register +static const X64Reg SCRATCH2 = XMM4;  /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one  static const X64Reg ONE = XMM14;  /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR @@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {              u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);              BLENDPS(SCRATCH, R(src), mask);          } else { -            MOVAPS(XMM4, R(src)); -            UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination +            MOVAPS(SCRATCH2, R(src)); +            UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination              UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination              // Compute selector to selectively copy source components to destination for SHUFPS instruction @@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {                       ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |                       ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |                       ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); -            SHUFPS(SCRATCH, R(XMM4), sel); +            SHUFPS(SCRATCH, R(SCRATCH2), sel);          }          // Store dest back to memory @@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {      }  } +void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { +    MOVAPS(scratch, R(src1)); +    CMPPS(scratch, R(src2), CMP_ORD); + +    MULPS(src1, R(src2)); + +    MOVAPS(src2, R(src1)); +    CMPPS(src2, R(src2), CMP_UNORD); + +    XORPS(scratch, R(src2)); +    ANDPS(src1, R(scratch)); +} +  void JitCompiler::Compile_EvaluateCondition(Instruction instr) {      // Note: NXOR is used below to check for equality      switch (instr.flow_control.op) { @@ -307,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); -    if (Common::GetCPUCaps().sse4_1) { -        DPPS(SRC1, R(SRC2), 0x7f); -    } else { -        MULPS(SRC1, R(SRC2)); +    Compile_SanitizedMul(SRC1, SRC2, SCRATCH); -        MOVAPS(SRC2, R(SRC1)); -        SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); +    MOVAPS(SRC2, R(SRC1)); +    SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); -        MOVAPS(SRC3, R(SRC1)); -        SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); +    MOVAPS(SRC3, R(SRC1)); +    SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); -        ADDPS(SRC1, R(SRC2)); -        ADDPS(SRC1, R(SRC3)); -    } +    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); +    ADDPS(SRC1, R(SRC2)); +    ADDPS(SRC1, R(SRC3));      Compile_DestEnable(instr, SRC1);  } @@ -330,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); -    if (Common::GetCPUCaps().sse4_1) { -        DPPS(SRC1, R(SRC2), 0xff); -    } else { -        MULPS(SRC1, R(SRC2)); +    Compile_SanitizedMul(SRC1, SRC2, SCRATCH); -        MOVAPS(SRC2, R(SRC1)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY -        ADDPS(SRC1, R(SRC2)); +    MOVAPS(SRC2, R(SRC1)); +    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY +    ADDPS(SRC1, R(SRC2)); -        MOVAPS(SRC2, R(SRC1)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX -        ADDPS(SRC1, R(SRC2)); -    } +    MOVAPS(SRC2, R(SRC1)); +    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX +    ADDPS(SRC1, R(SRC2));      Compile_DestEnable(instr, SRC1);  } @@ -359,23 +366,22 @@ void JitCompiler::Compile_DPH(Instruction instr) {      if (Common::GetCPUCaps().sse4_1) {          // Set 4th component to 1.0          BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 -        DPPS(SRC1, R(SRC2), 0xff);      } else { -        // Reverse to set the 4th component to 1.0 -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); -        MOVSS(SRC1, R(ONE)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); +        // Set 4th component to 1.0 +        MOVAPS(SCRATCH, R(SRC1)); +        UNPCKHPS(SCRATCH, R(ONE));  // XYZW, 1111 -> Z1__ +        UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 +    } -        MULPS(SRC1, R(SRC2)); +    Compile_SanitizedMul(SRC1, SRC2, SCRATCH); -        MOVAPS(SRC2, R(SRC1)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY -        ADDPS(SRC1, R(SRC2)); +    MOVAPS(SRC2, R(SRC1)); +    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY +    ADDPS(SRC1, R(SRC2)); -        MOVAPS(SRC2, R(SRC1)); -        SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX -        ADDPS(SRC1, R(SRC2)); -    } +    MOVAPS(SRC2, R(SRC1)); +    SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX +    ADDPS(SRC1, R(SRC2));      Compile_DestEnable(instr, SRC1);  } @@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {  void JitCompiler::Compile_MUL(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); -    MULPS(SRC1, R(SRC2)); +    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);      Compile_DestEnable(instr, SRC1);  } @@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {  void JitCompiler::Compile_MAX(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); +    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.      MAXPS(SRC1, R(SRC2));      Compile_DestEnable(instr, SRC1);  } @@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {  void JitCompiler::Compile_MIN(Instruction instr) {      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); +    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.      MINPS(SRC1, R(SRC2));      Compile_DestEnable(instr, SRC1);  } @@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {  }  void JitCompiler::Compile_CMP(Instruction instr) { +    using Op = Instruction::Common::CompareOpType::Op; +    Op op_x = instr.common.compare_op.x; +    Op op_y = instr.common.compare_op.y; +      Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);      Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); -    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; +    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to +    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here +    // because they don't match when used with NaNs. +    static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE }; + +    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); +    Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; +    Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; -    if (instr.common.compare_op.x == instr.common.compare_op.y) { +    if (op_x == op_y) {          // Compare X-component and Y-component together -        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); +        CMPPS(lhs_x, R(rhs_x), cmp[op_x]); +        MOVQ_xmm(R(COND0), lhs_x); -        MOVQ_xmm(R(COND0), SRC1);          MOV(64, R(COND1), R(COND0));      } else { +        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); +        Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; +        Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; +          // Compare X-component -        MOVAPS(SCRATCH, R(SRC1)); -        CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); +        MOVAPS(SCRATCH, R(lhs_x)); +        CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);          // Compare Y-component -        CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); +        CMPPS(lhs_y, R(rhs_y), cmp[op_y]);          MOVQ_xmm(R(COND0), SCRATCH); -        MOVQ_xmm(R(COND1), SRC1); +        MOVQ_xmm(R(COND1), lhs_y);      }      SHR(32, R(COND0), Imm8(31)); @@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {          Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);      } -    if (Common::GetCPUCaps().fma) { -        VFMADD213PS(SRC1, SRC2, R(SRC3)); -    } else { -        MULPS(SRC1, R(SRC2)); -        ADDPS(SRC1, R(SRC3)); -    } +    Compile_SanitizedMul(SRC1, SRC2, SCRATCH); +    ADDPS(SRC1, R(SRC3));      Compile_DestEnable(instr, SRC1);  } diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index fbe19fe93..58828ecc8 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -68,6 +68,12 @@ private:      void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);      void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); +    /** +     * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying +     * zero by inf. Clobbers `src2` and `scratch`. +     */ +    void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); +      void Compile_EvaluateCondition(Instruction instr);      void Compile_UniformCondition(Instruction instr);  | 
