diff --git a/src/render/d3d11/generated/render_d3d11.meta.h b/src/render/d3d11/generated/render_d3d11.meta.h index 134eccb5..4fc2c54b 100644 --- a/src/render/d3d11/generated/render_d3d11.meta.h +++ b/src/render/d3d11/generated/render_d3d11.meta.h @@ -205,10 +205,14 @@ str8_lit_comp( "cbuffer Globals : register(b0)\n" "{\n" " float4 rect;\n" -" float2 viewport_size;\n" -" float blur_size;\n" -" float is_vertical;\n" " float4 corner_radii_px;\n" +" float2 direction;\n" +" float2 viewport_size;\n" +" uint blur_count;\n" +"}\n" +"\n" +"cbuffer Kernel : register(b1)\n" +"{\n" " float4 kernel[32];\n" "}\n" "\n" @@ -221,7 +225,7 @@ str8_lit_comp( "{\n" " float4 position : SV_POSITION;\n" " float2 texcoord : TEX;\n" -" float2 cornercoord : CRN;\n" +" float2 sdf_sample_pos : SDF;\n" " float corner_radius : RAD;\n" "};\n" "\n" @@ -238,12 +242,12 @@ str8_lit_comp( "Vertex2Pixel\n" "vs_main(CPU2Vertex c2v)\n" "{\n" -" float4 vertex_positions__scrn[] =\n" +" float2 vertex_positions__scrn[] =\n" " {\n" -" float4(rect.x, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n" -" float4(rect.x, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n" -" float4(rect.z, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n" -" float4(rect.z, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0),\n" +" rect.xw,\n" +" rect.xy,\n" +" rect.zw,\n" +" rect.zy,\n" " };\n" " float corner_radii__px[] =\n" " {\n" @@ -252,22 +256,20 @@ str8_lit_comp( " corner_radii_px.w,\n" " corner_radii_px.z,\n" " };\n" -" float2 cornercoords__pct[] =\n" -" {\n" -" float2(0, 1),\n" -" float2(0, 0),\n" -" float2(1, 1),\n" -" float2(1, 0),\n" -" };\n" -" float4 vertex_position__scrn = vertex_positions__scrn[c2v.vertex_id];\n" -" float4 vertex_position__clip = float4(2*vertex_position__scrn.x/viewport_size.x - 1,\n" -" 2*vertex_position__scrn.y/viewport_size.y - 1,\n" -" 0, 1);\n" +" float2 cornercoords__pct = float2(\n" +" (c2v.vertex_id 
>> 1) ? 1.f : 0.f,\n" +" (c2v.vertex_id & 1) ? 0.f : 1.f);\n" +"\n" +" float2 vertex_position__pct = vertex_positions__scrn[c2v.vertex_id] / viewport_size;\n" +" float2 vertex_position__scr = 2.f * vertex_position__pct - 1.f;\n" +"\n" +" float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n" +"\n" " Vertex2Pixel v2p;\n" " {\n" -" v2p.position = vertex_position__clip;\n" -" v2p.texcoord = float2(vertex_position__scrn.x/viewport_size.x, 1 - vertex_position__scrn.y/viewport_size.y);\n" -" v2p.cornercoord = cornercoords__pct[c2v.vertex_id];\n" +" v2p.position = float4(vertex_position__scr.x, -vertex_position__scr.y, 0.f, 1.f);\n" +" v2p.texcoord = vertex_position__pct;\n" +" v2p.sdf_sample_pos = (2.f * cornercoords__pct - 1.f) * rect_half_size;\n" " v2p.corner_radius = corner_radii__px[c2v.vertex_id];\n" " }\n" " return v2p;\n" @@ -279,26 +281,27 @@ str8_lit_comp( "ps_main(Vertex2Pixel v2p) : SV_TARGET\n" "{\n" " // rjf: blend weighted texture samples into color\n" -" float4 color = stage_t2d.Sample(stage_sampler, v2p.texcoord) * kernel[0].x;\n" +" float4 color = kernel[0].x * stage_t2d.Sample(stage_sampler, v2p.texcoord);\n" " color.a = kernel[0].x;\n" -" for(float i = 1; i < blur_size; i += 1)\n" +"\n" +" for(uint i = 1; i < blur_count; i += 1)\n" " {\n" -" float weight = ((float[4])kernel[uint(i)/4])[uint(i)%4];\n" -" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n" -" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y));\n" -" min_sample.a = 1;\n" -" max_sample.a = 1;\n" -" color += min_sample*weight;\n" -" color += max_sample*weight;\n" +" float weight = kernel[i].x;\n" +" float offset = kernel[i].y;\n" +" float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - offset * direction);\n" +" float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + offset 
* direction);\n" +" min_sample.a = 1.f;\n" +" max_sample.a = 1.f;\n" +" color += min_sample * weight;\n" +" color += max_sample * weight;\n" " }\n" " \n" " // rjf: determine SDF sample position\n" " float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2);\n" -" float2 sdf_sample_pos = float2((2*v2p.cornercoord.x-1)*rect_half_size.x,\n" -" (2*v2p.cornercoord.y-1)*rect_half_size.y);\n" +" float2 sdf_sample_pos = v2p.sdf_sample_pos;\n" " \n" " // rjf: sample for corners\n" -" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - float2(2.f, 2.f), v2p.corner_radius);\n" +" float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - 2.f, v2p.corner_radius);\n" " float corner_sdf_t = 1-smoothstep(0, 2, corner_sdf_s);\n" " \n" " // rjf: weight output color by sdf\n" diff --git a/src/render/d3d11/render_d3d11.cpp b/src/render/d3d11/render_d3d11.cpp index 5e5ae948..dcfc5801 100644 --- a/src/render/d3d11/render_d3d11.cpp +++ b/src/render/d3d11/render_d3d11.cpp @@ -1183,99 +1183,140 @@ r_window_submit(OS_Handle window, R_Handle window_equip, R_PassList *passes) case R_PassKind_Blur: { R_PassParams_Blur *params = pass->params_blur; - ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Nearest]; + ID3D11SamplerState *sampler = r_d3d11_state->samplers[R_Tex2DSampleKind_Linear]; ID3D11VertexShader *vshad = r_d3d11_state->vshads[R_D3D11_VShadKind_Blur]; ID3D11PixelShader *pshad = r_d3d11_state->pshads[R_D3D11_PShadKind_Blur]; ID3D11Buffer *uniforms_buffer = r_d3d11_state->uniform_type_kind_buffers[R_D3D11_VShadKind_Blur]; - - //- rjf: perform blur on each axis - ID3D11RenderTargetView *rtvs[Axis2_COUNT] = + + // rjf: setup output merger + d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0); + d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff); + + // rjf: set up viewport + Vec2S32 resolution = wnd->last_resolution; + D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 
1.0f }; + d_ctx->RSSetViewports(1, &viewport); + d_ctx->RSSetState(r_d3d11_state->main_rasterizer); + + // rjf: setup input assembly + d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); + d_ctx->IASetInputLayout(0); + + // rjf: setup shaders + d_ctx->VSSetShader(vshad, 0, 0); + d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer); + d_ctx->PSSetShader(pshad, 0, 0); + d_ctx->PSSetSamplers(0, 1, &sampler); + + // rjf: setup scissor rect { - wnd->stage_scratch_color_rtv, - wnd->stage_color_rtv, - }; - ID3D11ShaderResourceView *srvs[Axis2_COUNT] = - { - wnd->stage_color_srv, - wnd->stage_scratch_color_srv, - }; - for(Axis2 axis = (Axis2)0; axis < Axis2_COUNT; axis = (Axis2)(axis+1)) - { - // rjf: setup output merger - d_ctx->OMSetRenderTargets(1, &rtvs[axis], 0); - d_ctx->OMSetDepthStencilState(r_d3d11_state->noop_depth_stencil, 0); - d_ctx->OMSetBlendState(r_d3d11_state->main_blend_state, 0, 0xffffffff); - - // rjf: set up viewport - Vec2S32 resolution = wnd->last_resolution; - D3D11_VIEWPORT viewport = { 0.0f, 0.0f, (F32)resolution.x, (F32)resolution.y, 0.0f, 1.0f }; - d_ctx->RSSetViewports(1, &viewport); - d_ctx->RSSetState(r_d3d11_state->main_rasterizer); - - // rjf: setup input assembly - d_ctx->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP); - d_ctx->IASetInputLayout(0); - - // rjf: set up uniforms - { - F32 stdev = (params->blur_size-1.f)/2.f; - F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev); - F32 euler32 = 2.718281828459045f; - R_D3D11_Uniforms_Blur uniforms = {0}; - uniforms.viewport_size = v2f32(resolution.x, resolution.y); - uniforms.rect = params->rect; - uniforms.blur_size = params->blur_size; - uniforms.is_vertical = (F32)!!axis; - MemoryCopyArray(uniforms.corner_radii.v, params->corner_radii); - F32 kernel_x = 0; - uniforms.kernel[0].v[0] = 1.f; - if(stdev > 0.f) - { - for(U64 idx = 0; idx < ArrayCount(uniforms.kernel); idx += 1) - { - for(U64 v_idx = 0; v_idx < ArrayCount(uniforms.kernel[idx].v); v_idx 
+= 1) - { - uniforms.kernel[idx].v[v_idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev)); - kernel_x += 1; - } - } - } - if(uniforms.kernel[0].v[0] > 1.f) - { - MemoryZeroArray(uniforms.kernel); - uniforms.kernel[0].v[0] = 1.f; - } - D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0}; - r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc); - MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms)); - r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0); - } - - // rjf: setup shaders - d_ctx->VSSetShader(vshad, 0, 0); - d_ctx->VSSetConstantBuffers(0, 1, &uniforms_buffer); - d_ctx->PSSetShader(pshad, 0, 0); - d_ctx->PSSetConstantBuffers(0, 1, &uniforms_buffer); - d_ctx->PSSetShaderResources(0, 1, &srvs[axis]); - d_ctx->PSSetSamplers(0, 1, &sampler); - - // rjf: setup scissor rect - { - D3D11_RECT rect = {0}; + D3D11_RECT rect = { 0 }; rect.left = 0; rect.right = (LONG)wnd->last_resolution.x; rect.top = 0; rect.bottom = (LONG)wnd->last_resolution.y; d_ctx->RSSetScissorRects(1, &rect); - } - - // rjf: draw - d_ctx->Draw(4, 0); - - // rjf: unset srv - ID3D11ShaderResourceView *srv = 0; - d_ctx->PSSetShaderResources(0, 1, &srv); } + + // rjf: set up uniforms + R_D3D11_Uniforms_Blur uniforms = { 0 }; + { + F32 weights[ArrayCount(uniforms.kernel)*2] = {0}; + + F32 blur_size = Min(params->blur_size, ArrayCount(weights)); + U64 blur_count = (U64)round_f32(blur_size); + + F32 stdev = (blur_size-1.f)/2.f; + F32 one_over_root_2pi_stdev2 = 1/sqrt_f32(2*pi32*stdev*stdev); + F32 euler32 = 2.718281828459045f; + + weights[0] = 1.f; + if(stdev > 0.f) + { + for(U64 idx = 0; idx < blur_count; idx += 1) + { + F32 kernel_x = (F32)idx; + weights[idx] = one_over_root_2pi_stdev2*pow_f32(euler32, -kernel_x*kernel_x/(2.f*stdev*stdev)); + } + } + if(weights[0] > 1.f) + { + MemoryZeroArray(weights); + weights[0] = 1.f; + } + else + { + // prepare weights & offsets for bilinear lookup + // blur filter wants to calculate 
w0*pixel[pos] + w1*pixel[pos+1] + ... + // with bilinear filter we can do this calculation by doing only w*sample(pos+t) = w*((1-t)*pixel[pos] + t*pixel[pos+1]) + // we can see w0=w*(1-t) and w1=w*t + // thus w=w0+w1 and t=w1/w + for (U64 idx = 1; idx < blur_count; idx += 2) + { + F32 w0 = weights[idx + 0]; + F32 w1 = weights[idx + 1]; + F32 w = w0 + w1; + F32 t = w1 / w; + + // each kernel element is float2(weight, offset) + // weights & offsets are adjusted for bilinear sampling + // zw elements are not used, a bit of waste but it allows for simpler shader code + uniforms.kernel[(idx+1)/2] = v4f32(w, (F32)idx + t, 0, 0); + } + uniforms.kernel[0].x = weights[0]; + } + + // technically only the direction needs to differ between the two passes + // but there are 256 bytes of usable space anyway for each constant buffer chunk + + uniforms.passes[Axis2_X].viewport_size = v2f32(resolution.x, resolution.y); + uniforms.passes[Axis2_X].rect = params->rect; + uniforms.passes[Axis2_X].direction = v2f32(1.f / resolution.x, 0); + uniforms.passes[Axis2_X].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling + MemoryCopyArray(uniforms.passes[Axis2_X].corner_radii.v, params->corner_radii); + + uniforms.passes[Axis2_Y].viewport_size = v2f32(resolution.x, resolution.y); + uniforms.passes[Axis2_Y].rect = params->rect; + uniforms.passes[Axis2_Y].direction = v2f32(0, 1.f / resolution.y); + uniforms.passes[Axis2_Y].blur_count = 1 + blur_count / 2; // 2x smaller because of bilinear sampling + MemoryCopyArray(uniforms.passes[Axis2_Y].corner_radii.v, params->corner_radii); + + D3D11_MAPPED_SUBRESOURCE sub_rsrc = {0}; + r_d3d11_state->device_ctx->Map(uniforms_buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &sub_rsrc); + MemoryCopy((U8 *)sub_rsrc.pData, &uniforms, sizeof(uniforms)); + r_d3d11_state->device_ctx->Unmap(uniforms_buffer, 0); + } + + ID3D11Buffer* uniforms_buffers[] = { uniforms_buffer, uniforms_buffer }; + + U32 uniform_offset[Axis2_COUNT][2] = + { + { 0 * 
sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 }, + { 1 * sizeof(R_D3D11_Uniforms_BlurPass) / 16, OffsetOf(R_D3D11_Uniforms_Blur, kernel) / 16 }, + }; + + U32 uniform_count[Axis2_COUNT][2] = + { + { sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 }, + { sizeof(R_D3D11_Uniforms_BlurPass) / 16, sizeof(uniforms.kernel) / 16 }, + }; + + // rjf: for unsetting srv + ID3D11ShaderResourceView* srv = 0; + + // horizontal pass + d_ctx->OMSetRenderTargets(1, &wnd->stage_scratch_color_rtv, 0); + d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_X], uniform_count[Axis2_X]); + d_ctx->PSSetShaderResources(0, 1, &wnd->stage_color_srv); + d_ctx->Draw(4, 0); + d_ctx->PSSetShaderResources(0, 1, &srv); + + // vertical pass + d_ctx->OMSetRenderTargets(1, &wnd->stage_color_rtv, 0); + d_ctx->PSSetConstantBuffers1(0, ArrayCount(uniforms_buffers), uniforms_buffers, uniform_offset[Axis2_Y], uniform_count[Axis2_Y]); + d_ctx->PSSetShaderResources(0, 1, &wnd->stage_scratch_color_srv); + d_ctx->Draw(4, 0); + d_ctx->PSSetShaderResources(0, 1, &srv); }break; diff --git a/src/render/d3d11/render_d3d11.h b/src/render/d3d11/render_d3d11.h index 565928e6..ca08fc79 100644 --- a/src/render/d3d11/render_d3d11.h +++ b/src/render/d3d11/render_d3d11.h @@ -32,13 +32,20 @@ struct R_D3D11_Uniforms_Rect Vec2F32 xform_scale; }; -struct R_D3D11_Uniforms_Blur +struct R_D3D11_Uniforms_BlurPass { Rng2F32 rect; - Vec2F32 viewport_size; - F32 blur_size; - F32 is_vertical; Vec4F32 corner_radii; + Vec2F32 direction; + Vec2F32 viewport_size; + U32 blur_count; + U8 _padding0_[204]; +}; +StaticAssert(sizeof(R_D3D11_Uniforms_BlurPass) % 256 == 0, NotAligned); // constant count/offset must be aligned to 256 bytes + +struct R_D3D11_Uniforms_Blur +{ + R_D3D11_Uniforms_BlurPass passes[Axis2_COUNT]; Vec4F32 kernel[32]; }; diff --git a/src/render/d3d11/render_d3d11.mdesk b/src/render/d3d11/render_d3d11.mdesk index 
64361bd0..6130d74c 100644 --- a/src/render/d3d11/render_d3d11.mdesk +++ b/src/render/d3d11/render_d3d11.mdesk @@ -204,10 +204,14 @@ ps_main(Vertex2Pixel vertex2pixel) : SV_TARGET cbuffer Globals : register(b0) { float4 rect; - float2 viewport_size; - float blur_size; - float is_vertical; float4 corner_radii_px; + float2 direction; + float2 viewport_size; + uint blur_count; +} + +cbuffer Kernel : register(b1) +{ float4 kernel[32]; } @@ -220,7 +224,7 @@ struct Vertex2Pixel { float4 position : SV_POSITION; float2 texcoord : TEX; - float2 cornercoord : CRN; + float2 sdf_sample_pos : SDF; float corner_radius : RAD; }; @@ -237,12 +241,12 @@ float rect_sdf(float2 sample_pos, float2 rect_half_size, float r) Vertex2Pixel vs_main(CPU2Vertex c2v) { - float4 vertex_positions__scrn[] = + float2 vertex_positions__scrn[] = { - float4(rect.x, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0), - float4(rect.x, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0), - float4(rect.z, rect.w, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0), - float4(rect.z, rect.y, 0, 1) * float4(1, -1, 1, 1) + float4(0, viewport_size.y, 0, 0), + rect.xw, + rect.xy, + rect.zw, + rect.zy, }; float corner_radii__px[] = { @@ -251,22 +255,20 @@ vs_main(CPU2Vertex c2v) corner_radii_px.w, corner_radii_px.z, }; - float2 cornercoords__pct[] = - { - float2(0, 1), - float2(0, 0), - float2(1, 1), - float2(1, 0), - }; - float4 vertex_position__scrn = vertex_positions__scrn[c2v.vertex_id]; - float4 vertex_position__clip = float4(2*vertex_position__scrn.x/viewport_size.x - 1, - 2*vertex_position__scrn.y/viewport_size.y - 1, - 0, 1); + float2 cornercoords__pct = float2( + (c2v.vertex_id >> 1) ? 1.f : 0.f, + (c2v.vertex_id & 1) ? 
0.f : 1.f); + + float2 vertex_position__pct = vertex_positions__scrn[c2v.vertex_id] / viewport_size; + float2 vertex_position__scr = 2.f * vertex_position__pct - 1.f; + + float2 rect_half_size = float2((rect.z-rect.x)/2, (rect.w-rect.y)/2); + Vertex2Pixel v2p; { - v2p.position = vertex_position__clip; - v2p.texcoord = float2(vertex_position__scrn.x/viewport_size.x, 1 - vertex_position__scrn.y/viewport_size.y); - v2p.cornercoord = cornercoords__pct[c2v.vertex_id]; + v2p.position = float4(vertex_position__scr.x, -vertex_position__scr.y, 0.f, 1.f); + v2p.texcoord = vertex_position__pct; + v2p.sdf_sample_pos = (2.f * cornercoords__pct - 1.f) * rect_half_size; v2p.corner_radius = corner_radii__px[c2v.vertex_id]; } return v2p; @@ -278,26 +280,27 @@ float4 ps_main(Vertex2Pixel v2p) : SV_TARGET { // rjf: blend weighted texture samples into color - float4 color = stage_t2d.Sample(stage_sampler, v2p.texcoord) * kernel[0].x; + float4 color = kernel[0].x * stage_t2d.Sample(stage_sampler, v2p.texcoord); color.a = kernel[0].x; - for(float i = 1; i < blur_size; i += 1) + + for(uint i = 1; i < blur_count; i += 1) { - float weight = ((float[4])kernel[uint(i)/4])[uint(i)%4]; - float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y)); - float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + float2(!is_vertical*i/viewport_size.x, is_vertical*i/viewport_size.y)); - min_sample.a = 1; - max_sample.a = 1; - color += min_sample*weight; - color += max_sample*weight; + float weight = kernel[i].x; + float offset = kernel[i].y; + float4 min_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord - offset * direction); + float4 max_sample = stage_t2d.Sample(stage_sampler, v2p.texcoord + offset * direction); + min_sample.a = 1.f; + max_sample.a = 1.f; + color += min_sample * weight; + color += max_sample * weight; } // rjf: determine SDF sample position float2 rect_half_size = float2((rect.z-rect.x)/2, 
(rect.w-rect.y)/2); - float2 sdf_sample_pos = float2((2*v2p.cornercoord.x-1)*rect_half_size.x, - (2*v2p.cornercoord.y-1)*rect_half_size.y); + float2 sdf_sample_pos = v2p.sdf_sample_pos; // rjf: sample for corners - float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - float2(2.f, 2.f), v2p.corner_radius); + float corner_sdf_s = rect_sdf(sdf_sample_pos, rect_half_size - 2.f, v2p.corner_radius); float corner_sdf_t = 1-smoothstep(0, 2, corner_sdf_s); // rjf: weight output color by sdf