diff options
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_workarounds.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_workarounds.c | 654 |
1 files changed, 318 insertions, 336 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index adc9a8ea410a..ec366cf9ef56 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -7,6 +7,7 @@ #include "i915_drv.h" #include "intel_context.h" #include "intel_engine_pm.h" +#include "intel_gpu_commands.h" #include "intel_gt.h" #include "intel_ring.h" #include "intel_workarounds.h" @@ -194,7 +195,7 @@ static void wa_add(struct i915_wa_list *wal, i915_reg_t reg, } static void -wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set) +wa_write_clr_set(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set) { wa_add(wal, reg, clear, set, clear); } @@ -202,21 +203,32 @@ wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 clear, u32 set) static void wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 set) { - wa_write_masked_or(wal, reg, ~0, set); + wa_write_clr_set(wal, reg, ~0, set); } static void wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set) { - wa_write_masked_or(wal, reg, set, set); + wa_write_clr_set(wal, reg, set, set); } static void wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr) { - wa_write_masked_or(wal, reg, clr, 0); + wa_write_clr_set(wal, reg, clr, 0); } +/* + * WA operations on "masked register". A masked register has the upper 16 bits + * documented as "masked" in b-spec. Its purpose is to allow writing to just a + * portion of the register without a rmw: you simply write in the upper 16 bits + * the mask of bits you are going to modify. + * + * The wa_masked_* family of functions already does the necessary operations to + * calculate the mask based on the parameters passed, so user only has to + * provide the lower 16 bits of that register. + */ + static void wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val) { @@ -229,38 +241,36 @@ wa_masked_dis(struct i915_wa_list *wal, i915_reg_t reg, u32 val) wa_add(wal, reg, 0, _MASKED_BIT_DISABLE(val), val); } -#define WA_SET_BIT_MASKED(addr, mask) \ - wa_masked_en(wal, (addr), (mask)) - -#define WA_CLR_BIT_MASKED(addr, mask) \ - wa_masked_dis(wal, (addr), (mask)) - -#define WA_SET_FIELD_MASKED(addr, mask, value) \ - wa_write_masked_or(wal, (addr), 0, _MASKED_FIELD((mask), (value))) +static void +wa_masked_field_set(struct i915_wa_list *wal, i915_reg_t reg, + u32 mask, u32 val) +{ + wa_add(wal, reg, 0, _MASKED_FIELD(mask, val), mask); +} static void gen6_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { - WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); + wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); } static void gen7_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { - WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); + wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); } static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { - WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); + wa_masked_en(wal, INSTPM, INSTPM_FORCE_ORDERING); /* WaDisableAsyncFlipPerfMode:bdw,chv */ - WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE); + wa_masked_en(wal, MI_MODE, ASYNC_FLIP_PERF_DISABLE); /* WaDisablePartialInstShootdown:bdw,chv */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, - PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + wa_masked_en(wal, GEN8_ROW_CHICKEN, + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); /* Use Force Non-Coherent whenever executing a 3D context. This is a * workaround for for a possible hang in the unlikely event a TLB @@ -268,9 +278,9 @@ static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, */ /* WaForceEnableNonCoherent:bdw,chv */ /* WaHdcDisableFetchWhenMasked:bdw,chv */ - WA_SET_BIT_MASKED(HDC_CHICKEN0, - HDC_DONOT_FETCH_MEM_WHEN_MASKED | - HDC_FORCE_NON_COHERENT); + wa_masked_en(wal, HDC_CHICKEN0, + HDC_DONOT_FETCH_MEM_WHEN_MASKED | + HDC_FORCE_NON_COHERENT); /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0: * "The Hierarchical Z RAW Stall Optimization allows non-overlapping @@ -280,10 +290,10 @@ static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, * * This optimization is off by default for BDW and CHV; turn it on. */ - WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); + wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); /* Wa4x4STCOptimizationDisable:bdw,chv */ - WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); + wa_masked_en(wal, CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); /* * BSpec recommends 8x4 when MSAA is used, @@ -293,7 +303,7 @@ static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, * disable bit, which we don't touch here, but it's good * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). */ - WA_SET_FIELD_MASKED(GEN7_GT_MODE, + wa_masked_field_set(wal, GEN7_GT_MODE, GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4); } @@ -306,24 +316,24 @@ static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine, gen8_ctx_workarounds_init(engine, wal); /* WaDisableThreadStallDopClockGating:bdw (pre-production) */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + wa_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); /* WaDisableDopClockGating:bdw * * Also see the related UCGTCL1 write in bdw_init_clock_gating() * to disable EUTC clock gating. */ - WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, - DOP_CLOCK_GATING_DISABLE); + wa_masked_en(wal, GEN7_ROW_CHICKEN2, + DOP_CLOCK_GATING_DISABLE); - WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, - GEN8_SAMPLER_POWER_BYPASS_DIS); + wa_masked_en(wal, HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); - WA_SET_BIT_MASKED(HDC_CHICKEN0, - /* WaForceContextSaveRestoreNonCoherent:bdw */ - HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | - /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ - (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0)); + wa_masked_en(wal, HDC_CHICKEN0, + /* WaForceContextSaveRestoreNonCoherent:bdw */ + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ + (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0)); } static void chv_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -332,10 +342,10 @@ static void chv_ctx_workarounds_init(struct intel_engine_cs *engine, gen8_ctx_workarounds_init(engine, wal); /* WaDisableThreadStallDopClockGating:chv */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + wa_masked_en(wal, GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); /* Improve HiZ throughput on CHV. */ - WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); + wa_masked_en(wal, HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); } static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -349,38 +359,38 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, * Must match Display Engine. See * WaCompressedResourceDisplayNewHashMode. */ - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN9_PBE_COMPRESSED_HASH_SELECTION); - WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, - GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN9_PBE_COMPRESSED_HASH_SELECTION); + wa_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7, + GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR); } /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */ /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, - FLOW_CONTROL_ENABLE | - PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + wa_masked_en(wal, GEN8_ROW_CHICKEN, + FLOW_CONTROL_ENABLE | + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */ /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */ - WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, - GEN9_ENABLE_YV12_BUGFIX | - GEN9_ENABLE_GPGPU_PREEMPTION); + wa_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7, + GEN9_ENABLE_YV12_BUGFIX | + GEN9_ENABLE_GPGPU_PREEMPTION); /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */ /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */ - WA_SET_BIT_MASKED(CACHE_MODE_1, - GEN8_4x4_STC_OPTIMIZATION_DISABLE | - GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE); + wa_masked_en(wal, CACHE_MODE_1, + GEN8_4x4_STC_OPTIMIZATION_DISABLE | + GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE); /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */ - WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5, - GEN9_CCS_TLB_PREFETCH_ENABLE); + wa_masked_dis(wal, GEN9_HALF_SLICE_CHICKEN5, + GEN9_CCS_TLB_PREFETCH_ENABLE); /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */ - WA_SET_BIT_MASKED(HDC_CHICKEN0, - HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | - HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); + wa_masked_en(wal, HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are * both tied to WaForceContextSaveRestoreNonCoherent @@ -396,19 +406,19 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, */ /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */ - WA_SET_BIT_MASKED(HDC_CHICKEN0, - HDC_FORCE_NON_COHERENT); + wa_masked_en(wal, HDC_CHICKEN0, + HDC_FORCE_NON_COHERENT); /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */ if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915) || IS_COMETLAKE(i915)) - WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, - GEN8_SAMPLER_POWER_BYPASS_DIS); + wa_masked_en(wal, HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */ - WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); + wa_masked_en(wal, HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); /* * Supporting preemption with fine-granularity requires changes in the @@ -422,16 +432,16 @@ static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, */ /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */ - WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); + wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */ - WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + wa_masked_field_set(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK, GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); /* WaClearHIZ_WM_CHICKEN3:bxt,glk */ if (IS_GEN9_LP(i915)) - WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ); + wa_masked_en(wal, GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ); } static void skl_tune_iz_hashing(struct intel_engine_cs *engine, @@ -465,7 +475,7 @@ static void skl_tune_iz_hashing(struct intel_engine_cs *engine, return; /* Tune IZ hashing. See intel_device_info_runtime_init() */ - WA_SET_FIELD_MASKED(GEN7_GT_MODE, + wa_masked_field_set(wal, GEN7_GT_MODE, GEN9_IZ_HASHING_MASK(2) | GEN9_IZ_HASHING_MASK(1) | GEN9_IZ_HASHING_MASK(0), @@ -487,12 +497,12 @@ static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine, gen9_ctx_workarounds_init(engine, wal); /* WaDisableThreadStallDopClockGating:bxt */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, - STALL_DOP_GATING_DISABLE); + wa_masked_en(wal, GEN8_ROW_CHICKEN, + STALL_DOP_GATING_DISABLE); /* WaToEnableHwFixForPushConstHWBug:bxt */ - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); } static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -504,12 +514,12 @@ static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, /* WaToEnableHwFixForPushConstHWBug:kbl */ if (IS_KBL_GT_REVID(i915, KBL_REVID_C0, REVID_FOREVER)) - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); /* WaDisableSbeCacheDispatchPortSharing:kbl */ - WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, - GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); + wa_masked_en(wal, GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); } static void glk_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -518,8 +528,8 @@ static void glk_ctx_workarounds_init(struct intel_engine_cs *engine, gen9_ctx_workarounds_init(engine, wal); /* WaToEnableHwFixForPushConstHWBug:glk */ - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); } static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -528,41 +538,41 @@ static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine, gen9_ctx_workarounds_init(engine, wal); /* WaToEnableHwFixForPushConstHWBug:cfl */ - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); /* WaDisableSbeCacheDispatchPortSharing:cfl */ - WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, - GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); + wa_masked_en(wal, GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); } static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { /* WaForceContextSaveRestoreNonCoherent:cnl */ - WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0, - HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT); + wa_masked_en(wal, CNL_HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT); /* WaDisableReplayBufferBankArbitrationOptimization:cnl */ - WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, - GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + wa_masked_en(wal, COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); /* WaPushConstantDereferenceHoldDisable:cnl */ - WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE); + wa_masked_en(wal, GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE); /* FtrEnableFastAnisoL1BankingFix:cnl */ - WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX); + wa_masked_en(wal, HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX); /* WaDisable3DMidCmdPreemption:cnl */ - WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); + wa_masked_dis(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); /* WaDisableGPGPUMidCmdPreemption:cnl */ - WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + wa_masked_field_set(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK, GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); /* WaDisableEarlyEOT:cnl */ - WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT); + wa_masked_en(wal, GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT); } static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, @@ -580,8 +590,8 @@ static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, * Formerly known as WaPushConstantDereferenceHoldDisable */ if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) - WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, - PUSH_CONSTANT_DEREF_DISABLE); + wa_masked_en(wal, GEN7_ROW_CHICKEN2, + PUSH_CONSTANT_DEREF_DISABLE); /* WaForceEnableNonCoherent:icl * This is not the same workaround as in early Gen9 platforms, where @@ -590,40 +600,40 @@ static void icl_ctx_workarounds_init(struct intel_engine_cs *engine, * (the register is whitelisted in hardware now, so UMDs can opt in * for coherency if they have a good reason). */ - WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT); + wa_masked_en(wal, ICL_HDC_MODE, HDC_FORCE_NON_COHERENT); /* Wa_2006611047:icl (pre-prod) * Formerly known as WaDisableImprovedTdlClkGating */ if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) - WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, - GEN11_TDL_CLOCK_GATING_FIX_DISABLE); + wa_masked_en(wal, GEN7_ROW_CHICKEN2, + GEN11_TDL_CLOCK_GATING_FIX_DISABLE); /* Wa_2006665173:icl (pre-prod) */ if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) - WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, - GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC); + wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3, + GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC); /* WaEnableFloatBlendOptimization:icl */ - wa_write_masked_or(wal, - GEN10_CACHE_MODE_SS, - 0, /* write-only, so skip validation */ - _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE)); + wa_write_clr_set(wal, + GEN10_CACHE_MODE_SS, + 0, /* write-only, so skip validation */ + _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE)); /* WaDisableGPGPUMidThreadPreemption:icl */ - WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + wa_masked_field_set(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK, GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); /* allow headerless messages for preemptible GPGPU context */ - WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE, - GEN11_SAMPLER_ENABLE_HEADLESS_MSG); + wa_masked_en(wal, GEN10_SAMPLER_MODE, + GEN11_SAMPLER_ENABLE_HEADLESS_MSG); /* Wa_1604278689:icl,ehl */ wa_write(wal, IVB_FBC_RT_BASE, 0xFFFFFFFF & ~ILK_FBC_RT_VALID); - wa_write_masked_or(wal, IVB_FBC_RT_BASE_UPPER, - 0, /* write-only register; skip validation */ - 0xFFFFFFFF); + wa_write_clr_set(wal, IVB_FBC_RT_BASE_UPPER, + 0, /* write-only register; skip validation */ + 0xFFFFFFFF); /* Wa_1406306137:icl,ehl */ wa_masked_en(wal, GEN9_ROW_CHICKEN4, GEN11_DIS_PICK_2ND_EU); @@ -643,11 +653,11 @@ static void gen12_ctx_workarounds_init(struct intel_engine_cs *engine, * Wa_14010443199:rkl * Wa_14010698770:rkl */ - WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, - GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); + wa_masked_en(wal, GEN11_COMMON_SLICE_CHICKEN3, + GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); /* WaDisableGPGPUMidThreadPreemption:gen12 */ - WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + wa_masked_field_set(wal, GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK, GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); } @@ -680,12 +690,22 @@ static void dg1_ctx_workarounds_init(struct intel_engine_cs *engine, gen12_ctx_workarounds_init(engine, wal); /* Wa_1409044764 */ - WA_CLR_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, - DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN); + wa_masked_dis(wal, GEN11_COMMON_SLICE_CHICKEN3, + DG1_FLOAT_POINT_BLEND_OPT_STRICT_MODE_EN); /* Wa_22010493298 */ - WA_SET_BIT_MASKED(HIZ_CHICKEN, - DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE); + wa_masked_en(wal, HIZ_CHICKEN, + DG1_HZ_READ_SUPPRESSION_OPTIMIZATION_DISABLE); + + /* + * Wa_16011163337 + * + * Like in tgl_ctx_workarounds_init(), read verification is ignored due + * to Wa_1608008084. + */ + wa_add(wal, + FF_MODE2, + FF_MODE2_GS_TIMER_MASK, FF_MODE2_GS_TIMER_224, 0); } static void @@ -804,57 +824,11 @@ ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) static void snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) { - /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ - wa_masked_en(wal, - _3D_CHICKEN, - _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB); - - /* WaDisable_RenderCache_OperationalFlush:snb */ - wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - wa_add(wal, - GEN6_GT_MODE, 0, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), - GEN6_WIZ_HASHING_16x4); - - wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB); - - wa_masked_en(wal, - _3D_CHICKEN3, - /* WaStripsFansDisableFastClipPerformanceFix:snb */ - _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL | - /* - * Bspec says: - * "This bit must be set if 3DSTATE_CLIP clip mode is set - * to normal and 3DSTATE_SF number of SF output attributes - * is more than 16." - */ - _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH); } static void ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) { - /* WaDisableEarlyCull:ivb */ - wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); - - /* WaDisablePSDDualDispatchEnable:ivb */ - if (IS_IVB_GT1(i915)) - wa_masked_en(wal, - GEN7_HALF_SLICE_CHICKEN1, - GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); - - /* WaDisable_RenderCache_OperationalFlush:ivb */ - wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); - /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */ wa_masked_dis(wal, GEN7_COMMON_SLICE_CHICKEN1, @@ -866,91 +840,15 @@ ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) /* WaForceL3Serialization:ivb */ wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); - - /* - * WaVSThreadDispatchOverride:ivb,vlv - * - * This actually overrides the dispatch - * mode for all thread types. - */ - wa_write_masked_or(wal, GEN7_FF_THREAD_MODE, - GEN7_FF_SCHED_MASK, - GEN7_FF_TS_SCHED_HW | - GEN7_FF_VS_SCHED_HW | - GEN7_FF_DS_SCHED_HW); - - if (0) { /* causes HiZ corruption on ivb:gt1 */ - /* enable HiZ Raw Stall Optimization */ - wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); - } - - /* WaDisable4x2SubspanOptimization:ivb */ - wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - wa_add(wal, GEN7_GT_MODE, 0, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), - GEN6_WIZ_HASHING_16x4); } static void vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) { - /* WaDisableEarlyCull:vlv */ - wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); - - /* WaPsdDispatchEnable:vlv */ - /* WaDisablePSDDualDispatchEnable:vlv */ - wa_masked_en(wal, - GEN7_HALF_SLICE_CHICKEN1, - GEN7_MAX_PS_THREAD_DEP | - GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); - - /* WaDisable_RenderCache_OperationalFlush:vlv */ - wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); - /* WaForceL3Serialization:vlv */ wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); /* - * WaVSThreadDispatchOverride:ivb,vlv - * - * This actually overrides the dispatch - * mode for all thread types. - */ - wa_write_masked_or(wal, - GEN7_FF_THREAD_MODE, - GEN7_FF_SCHED_MASK, - GEN7_FF_TS_SCHED_HW | - GEN7_FF_VS_SCHED_HW | - GEN7_FF_DS_SCHED_HW); - - /* - * BSpec says this must be set, even though - * WaDisable4x2SubspanOptimization isn't listed for VLV. - */ - wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - wa_add(wal, GEN7_GT_MODE, 0, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), - GEN6_WIZ_HASHING_16x4); - - /* * WaIncreaseL3CreditsForVLVB0:vlv * This is the hardware default actually. */ @@ -970,31 +868,6 @@ hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) /* WaVSRefCountFullforceMissDisable:hsw */ wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME); - - wa_masked_dis(wal, - CACHE_MODE_0_GEN7, - /* WaDisable_RenderCache_OperationalFlush:hsw */ - RC_OP_FLUSH_ENABLE | - /* enable HiZ Raw Stall Optimization */ - HIZ_RAW_STALL_OPT_DISABLE); - - /* WaDisable4x2SubspanOptimization:hsw */ - wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - wa_add(wal, GEN7_GT_MODE, 0, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), - GEN6_WIZ_HASHING_16x4); - - /* WaSampleCChickenBitEnable:hsw */ - wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE); } static void @@ -1164,7 +1037,7 @@ wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal) drm_dbg(&i915->drm, "MCR slice/subslice = %x\n", mcr); - wa_write_masked_or(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr); + wa_write_clr_set(wal, GEN8_MCR_SELECTOR, mcr_mask, mcr); } static void @@ -1189,10 +1062,10 @@ icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); /* WaModifyGamTlbPartitioning:icl */ - wa_write_masked_or(wal, - GEN11_GACB_PERF_CTRL, - GEN11_HASH_CTRL_MASK, - GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4); + wa_write_clr_set(wal, + GEN11_GACB_PERF_CTRL, + GEN11_HASH_CTRL_MASK, + GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4); /* Wa_1405766107:icl * Formerly known as WaCL2SFHalfMaxAlloc @@ -1260,6 +1133,11 @@ tgl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) wa_write_or(wal, SLICE_UNIT_LEVEL_CLKGATE, L3_CLKGATE_DIS | L3_CR2X_CLKGATE_DIS); + + /* Wa_1408615072:tgl[a0] */ + if (IS_TGL_UY_GT_REVID(i915, TGL_REVID_A0, TGL_REVID_A0)) + wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, + VSUNIT_CLKGATE_DIS_TGL); } static void @@ -1358,9 +1236,9 @@ static bool wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from) { if ((cur ^ wa->set) & wa->read) { - DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x)\n", + DRM_ERROR("%s workaround lost on %s! (reg[%x]=0x%x, relevant bits were 0x%x vs expected 0x%x)\n", name, from, i915_mmio_reg_offset(wa->reg), - cur, cur & wa->read, wa->set); + cur, cur & wa->read, wa->set & wa->read); return false; } @@ -1425,7 +1303,8 @@ bool intel_gt_verify_workarounds(struct intel_gt *gt, const char *from) return wa_list_verify(gt->uncore, >->i915->gt_wa_list, from); } -static inline bool is_nonpriv_flags_valid(u32 flags) +__maybe_unused +static bool is_nonpriv_flags_valid(u32 flags) { /* Check only valid flag bits are set */ if (flags & ~RING_FORCE_TO_NONPRIV_MASK_VALID) @@ -1752,10 +1631,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) wa_write_or(wal, GEN7_SARCHKMD, GEN7_DISABLE_SAMPLER_PREFETCH); - - /* Wa_1408615072:tgl */ - wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE2, - VSUNIT_CLKGATE_DIS_TGL); } if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { @@ -1770,6 +1645,19 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) */ wa_write_or(wal, GEN7_FF_THREAD_MODE, GEN12_FF_TESSELATION_DOP_GATE_DISABLE); + + /* + * Wa_1606700617:tgl,dg1 + * Wa_22010271021:tgl,rkl,dg1 + */ + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); + + /* Wa_1406941453:tgl,rkl,dg1 */ + wa_masked_en(wal, + GEN10_SAMPLER_MODE, + ENABLE_SMALLPL); } if (IS_DG1_REVID(i915, DG1_REVID_A0, DG1_REVID_A0) || @@ -1798,21 +1686,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN6_RC_SLEEP_PSMI_CONTROL, GEN12_WAIT_FOR_EVENT_POWER_DOWN_DISABLE | GEN8_RC_SEMA_IDLE_MSG_DISABLE); - - /* - * Wa_1606700617:tgl - * Wa_22010271021:tgl,rkl - */ - wa_masked_en(wal, - GEN9_CS_DEBUG_MODE1, - FF_DOP_CLOCK_GATE_DISABLE); - } - - if (IS_GEN(i915, 12)) { - /* Wa_1406941453:gen12 */ - wa_masked_en(wal, - GEN10_SAMPLER_MODE, - ENABLE_SMALLPL); } if (IS_GEN(i915, 11)) { @@ -1838,14 +1711,14 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) * Wa_1604223664:icl * Formerly known as WaL3BankAddressHashing */ - wa_write_masked_or(wal, - GEN8_GARBCNTL, - GEN11_HASH_CTRL_EXCL_MASK, - GEN11_HASH_CTRL_EXCL_BIT0); - wa_write_masked_or(wal, - GEN11_GLBLINVL, - GEN11_BANK_HASH_ADDR_EXCL_MASK, - GEN11_BANK_HASH_ADDR_EXCL_BIT0); + wa_write_clr_set(wal, + GEN8_GARBCNTL, + GEN11_HASH_CTRL_EXCL_MASK, + GEN11_HASH_CTRL_EXCL_BIT0); + wa_write_clr_set(wal, + GEN11_GLBLINVL, + GEN11_BANK_HASH_ADDR_EXCL_MASK, + GEN11_BANK_HASH_ADDR_EXCL_BIT0); /* * Wa_1405733216:icl @@ -1874,10 +1747,10 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN7_DISABLE_SAMPLER_PREFETCH); /* Wa_1409178092:icl */ - wa_write_masked_or(wal, - GEN11_SCRATCH2, - GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE, - 0); + wa_write_clr_set(wal, + GEN11_SCRATCH2, + GEN11_COHERENT_PARTIAL_WRITE_MERGE_ENABLE, + 0); /* WaEnable32PlaneMode:icl */ wa_masked_en(wal, GEN9_CSFE_CHICKEN1_RCS, @@ -1951,24 +1824,132 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */ if (IS_GEN9_LP(i915)) - wa_write_masked_or(wal, - GEN8_L3SQCREG1, - L3_PRIO_CREDITS_MASK, - L3_GENERAL_PRIO_CREDITS(62) | - L3_HIGH_PRIO_CREDITS(2)); + wa_write_clr_set(wal, + GEN8_L3SQCREG1, + L3_PRIO_CREDITS_MASK, + L3_GENERAL_PRIO_CREDITS(62) | + L3_HIGH_PRIO_CREDITS(2)); /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */ wa_write_or(wal, GEN8_L3SQCREG4, GEN8_LQSC_FLUSH_COHERENT_LINES); + + /* Disable atomics in L3 to prevent unrecoverable hangs */ + wa_write_clr_set(wal, GEN9_SCRATCH_LNCF1, + GEN9_LNCF_NONIA_COHERENT_ATOMICS_ENABLE, 0); + wa_write_clr_set(wal, GEN8_L3SQCREG4, + GEN8_LQSQ_NONIA_COHERENT_ATOMICS_ENABLE, 0); + wa_write_clr_set(wal, GEN9_SCRATCH1, + EVICTION_PERF_FIX_ENABLE, 0); + } + + if (IS_HASWELL(i915)) { + /* WaSampleCChickenBitEnable:hsw */ + wa_masked_en(wal, + HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE); + + wa_masked_dis(wal, + CACHE_MODE_0_GEN7, + /* enable HiZ Raw Stall Optimization */ + HIZ_RAW_STALL_OPT_DISABLE); + + /* WaDisable4x2SubspanOptimization:hsw */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + } + + if (IS_VALLEYVIEW(i915)) { + /* WaDisableEarlyCull:vlv */ + wa_masked_en(wal, + _3D_CHICKEN3, + _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_clr_set(wal, + GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + /* WaPsdDispatchEnable:vlv */ + /* WaDisablePSDDualDispatchEnable:vlv */ + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_MAX_PS_THREAD_DEP | + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); } - if (IS_GEN(i915, 7)) + if (IS_IVYBRIDGE(i915)) { + /* WaDisableEarlyCull:ivb */ + wa_masked_en(wal, + _3D_CHICKEN3, + _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + if (0) { /* causes HiZ corruption on ivb:gt1 */ + /* enable HiZ Raw Stall Optimization */ + wa_masked_dis(wal, + CACHE_MODE_0_GEN7, + HIZ_RAW_STALL_OPT_DISABLE); + } + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_clr_set(wal, + GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + /* WaDisablePSDDualDispatchEnable:ivb */ + if (IS_IVB_GT1(i915)) + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); + } + + if (IS_GEN(i915, 7)) { /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */ wa_masked_en(wal, GFX_MODE_GEN7, GFX_TLB_INVALIDATE_EXPLICIT | GFX_REPLAY_MODE); + /* WaDisable_RenderCache_OperationalFlush:ivb,vlv,hsw */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); + + /* + * BSpec says this must be set, even though + * WaDisable4x2SubspanOptimization:ivb,hsw + * WaDisable4x2SubspanOptimization isn't listed for VLV. + */ + wa_masked_en(wal, + CACHE_MODE_1, + PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, + GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + } + if (IS_GEN_RANGE(i915, 6, 7)) /* * We need to disable the AsyncFlip performance optimisations in @@ -1991,6 +1972,39 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GFX_MODE, GFX_TLB_INVALIDATE_EXPLICIT); + /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ + wa_masked_en(wal, + _3D_CHICKEN, + _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB); + + wa_masked_en(wal, + _3D_CHICKEN3, + /* WaStripsFansDisableFastClipPerformanceFix:snb */ + _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL | + /* + * Bspec says: + * "This bit must be set if 3DSTATE_CLIP clip mode is set + * to normal and 3DSTATE_SF number of SF output attributes + * is more than 16." + */ + _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, + GEN6_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + /* WaDisable_RenderCache_OperationalFlush:snb */ + wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); + /* * From the Sandybridge PRM, volume 1 part 3, page 24: * "If this bit is set, STCunit will have LRA as replacement @@ -2067,39 +2081,6 @@ void intel_engine_apply_workarounds(struct intel_engine_cs *engine) wa_list_apply(engine->uncore, &engine->wa_list); } -static struct i915_vma * -create_scratch(struct i915_address_space *vm, int count) -{ - struct drm_i915_gem_object *obj; - struct i915_vma *vma; - unsigned int size; - int err; - - size = round_up(count * sizeof(u32), PAGE_SIZE); - obj = i915_gem_object_create_internal(vm->i915, size); - if (IS_ERR(obj)) - return ERR_CAST(obj); - - i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); - - vma = i915_vma_instance(obj, vm, NULL); - if (IS_ERR(vma)) { - err = PTR_ERR(vma); - goto err_obj; - } - - err = i915_vma_pin(vma, 0, 0, - i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER); - if (err) - goto err_obj; - - return vma; - -err_obj: - i915_gem_object_put(obj); - return ERR_PTR(err); -} - struct mcr_range { u32 start; u32 end; @@ -2202,7 +2183,8 @@ static int engine_wa_list_verify(struct intel_context *ce, if (!wal->count) return 0; - vma = create_scratch(&ce->engine->gt->ggtt->vm, wal->count); + vma = __vm_create_scratch_for_read(&ce->engine->gt->ggtt->vm, + wal->count * sizeof(u32)); if (IS_ERR(vma)) return PTR_ERR(vma); |