	Query Cache: Implement host side sample counting.
This commit is contained in:
parent 2fea1b8407
commit c8237d5c31
				| @ -41,6 +41,7 @@ set(SHADER_FILES | ||||
|     pitch_unswizzle.comp | ||||
|     present_bicubic.frag | ||||
|     present_gaussian.frag | ||||
|     queries_prefix_scan_sum.comp | ||||
|     resolve_conditional_render.comp | ||||
|     smaa_edge_detection.vert | ||||
|     smaa_edge_detection.frag | ||||
|  | ||||
							
								
								
									
124 src/video_core/host_shaders/queries_prefix_scan_sum.comp Normal file
							| @ -0,0 +1,124 @@ | ||||
| // SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel | ||||
| // SPDX-License-Identifier: MIT | ||||
| 
 | ||||
| // Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and | ||||
| // Nicholas Haemel. Modified to suit needs and optimize for subgroup | ||||
| 
 | ||||
| #version 460 core | ||||
| 
 | ||||
| #ifdef VULKAN | ||||
| 
 | ||||
| #extension GL_KHR_shader_subgroup_arithmetic : enable | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| #define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { | ||||
| #define END_PUSH_CONSTANTS                                                                         \ | ||||
|     }                                                                                              \ | ||||
|     ; | ||||
| #define UNIFORM(n) | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_OUTPUT_IMAGE 1 | ||||
| 
 | ||||
| #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||||
| 
 | ||||
| #extension GL_KHR_shader_subgroup_arithmetic : enable | ||||
| #extension GL_NV_gpu_shader5 : enable | ||||
| #ifdef GL_NV_gpu_shader5 | ||||
| #define HAS_EXTENDED_TYPES 1 | ||||
| #else | ||||
| #define HAS_EXTENDED_TYPES 0 | ||||
| #endif | ||||
| #define BEGIN_PUSH_CONSTANTS | ||||
| #define END_PUSH_CONSTANTS | ||||
| #define UNIFORM(n) layout(location = n) uniform | ||||
| #define BINDING_INPUT_BUFFER 0 | ||||
| #define BINDING_OUTPUT_IMAGE 0 | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| BEGIN_PUSH_CONSTANTS | ||||
| UNIFORM(0) uint max_accumulation_base; | ||||
| UNIFORM(1) uint accumulation_limit; | ||||
| END_PUSH_CONSTANTS | ||||
| 
 | ||||
| layout(local_size_x = 32) in; | ||||
| 
 | ||||
| layout(std430, binding = 0) readonly buffer block1 { | ||||
|     uvec2 input_data[gl_WorkGroupSize.x]; | ||||
| }; | ||||
| 
 | ||||
| layout(std430, binding = 1) writeonly coherent buffer block2 { | ||||
|     uvec2 output_data[gl_WorkGroupSize.x]; | ||||
| }; | ||||
| 
 | ||||
| layout(std430, binding = 2) coherent buffer block3 { | ||||
|     uvec2 accumulated_data; | ||||
| }; | ||||
| 
 | ||||
| shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; | ||||
| 
 | ||||
| uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { | ||||
|     uint carry = 0; | ||||
|     uvec2 result; | ||||
|     result.x = uaddCarry(value_1.x, value_2.x, carry); | ||||
|     result.y = value_1.y + value_2.y + carry; | ||||
|     return result; | ||||
| } | ||||
| 
 | ||||
| void main(void) { | ||||
|     uint id = gl_LocalInvocationID.x; | ||||
|     uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); | ||||
|     uint work_size = gl_WorkGroupSize.x; | ||||
|     uint rd_id; | ||||
|     uint wr_id; | ||||
|     uint mask; | ||||
|     uvec2 input_1 = input_data[id * 2]; | ||||
|     uvec2 input_2 = input_data[id * 2 + 1]; | ||||
|     // The number of steps is the log base 2 of the | ||||
|     // work group size, which should be a power of 2 | ||||
|     const uint steps = uint(log2(work_size)) + 1; | ||||
|     uint step = 0; | ||||
| 
 | ||||
|     // Each invocation is responsible for the content of | ||||
|     // two elements of the output array | ||||
|     shared_data[id * 2] = input_1; | ||||
|     shared_data[id * 2 + 1] = input_2; | ||||
|     // Synchronize to make sure that everyone has initialized | ||||
|     // their elements of shared_data[] with data loaded from | ||||
|     // the input arrays | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
|     // For each step... | ||||
|     for (step = 0; step < steps; step++) { | ||||
|         // Calculate the read and write index in the | ||||
|         // shared array | ||||
|         mask = (1 << step) - 1; | ||||
|         rd_id = ((id >> step) << (step + 1)) + mask; | ||||
|         wr_id = rd_id + 1 + (id & mask); | ||||
|         // Accumulate the read data into our element | ||||
| 
 | ||||
|         shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]); | ||||
|         // Synchronize again to make sure that everyone | ||||
|         // has caught up with us | ||||
|         barrier(); | ||||
|         memoryBarrierShared(); | ||||
|     } | ||||
|     // Add the accumulation | ||||
|     shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); | ||||
|     shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); | ||||
|     barrier(); | ||||
|     memoryBarrierShared(); | ||||
| 
 | ||||
|     // Finally write our data back to the output buffer | ||||
|     output_data[id * 2] = shared_data[id * 2]; | ||||
|     output_data[id * 2 + 1] = shared_data[id * 2 + 1]; | ||||
|     if (id == 0) { | ||||
|         if (max_accumulation_base >= accumulation_limit + 1) { | ||||
|             accumulated_data = shared_data[accumulation_limit]; | ||||
|             return; | ||||
|         } | ||||
|         uvec2 value_1 = shared_data[max_accumulation_base]; | ||||
|         uvec2 value_2 = shared_data[accumulation_limit]; | ||||
|         accumulated_data = AddUint64(value_1, -value_2); | ||||
|     } | ||||
| } | ||||
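Note: the shader above is the shared-memory inclusive prefix sum from the OpenGL SuperBible, with each 64-bit sample counter stored as a uvec2 and added via uaddCarry because native 64-bit integers cannot be assumed. The following host-side sketch is illustrative only and not part of this commit; it uses plain uint64_t in place of the uvec2 carry arithmetic so the rd_id/wr_id indexing can be checked on the CPU.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // CPU model of one 32-wide workgroup: data holds 2 * work_size counters, and each
    // iteration of the outer loop corresponds to one barrier-separated step in the shader.
    std::vector<std::uint64_t> PrefixScanReference(std::vector<std::uint64_t> data) {
        const std::size_t work_size = data.size() / 2; // gl_WorkGroupSize.x == 32 in the shader
        std::size_t steps = 0;
        while ((std::size_t{1} << steps) < data.size()) {
            ++steps; // equals log2(work_size) + 1, as in the shader
        }
        for (std::size_t step = 0; step < steps; ++step) {
            for (std::size_t id = 0; id < work_size; ++id) {
                const std::size_t mask = (std::size_t{1} << step) - 1;
                const std::size_t rd_id = ((id >> step) << (step + 1)) + mask;
                const std::size_t wr_id = rd_id + 1 + (id & mask);
                // Reads always target the lower half of each 2^(step+1) block, which is
                // never written during the same step, so a sequential loop models it safely.
                data[wr_id] += data[rd_id];
            }
        }
        return data; // data[i] is now the inclusive sum of the original data[0..i]
    }

Feeding this model the per-slot sample counts reproduces the partial sums the shader writes to output_data, before the carried base value from accumulated_data is folded in.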
| @ -12,6 +12,7 @@ | ||||
| #include "common/common_types.h" | ||||
| #include "common/div_ceil.h" | ||||
| #include "video_core/host_shaders/astc_decoder_comp_spv.h" | ||||
| #include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h" | ||||
| #include "video_core/host_shaders/resolve_conditional_render_comp_spv.h" | ||||
| #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" | ||||
| #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" | ||||
| @ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE | ||||
|     }, | ||||
| }}; | ||||
| 
 | ||||
| constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{ | ||||
|     { | ||||
|         .binding = 0, | ||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|         .descriptorCount = 1, | ||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|         .pImmutableSamplers = nullptr, | ||||
|     }, | ||||
|     { | ||||
|         .binding = 1, | ||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|         .descriptorCount = 1, | ||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|         .pImmutableSamplers = nullptr, | ||||
|     }, | ||||
|     { | ||||
|         .binding = 2, | ||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|         .descriptorCount = 1, | ||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||
|         .pImmutableSamplers = nullptr, | ||||
|     }, | ||||
| }}; | ||||
| 
 | ||||
| constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | ||||
|     .uniform_buffers = 0, | ||||
|     .storage_buffers = 2, | ||||
| @ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | ||||
|     .score = 2, | ||||
| }; | ||||
| 
 | ||||
| constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{ | ||||
|     .uniform_buffers = 0, | ||||
|     .storage_buffers = 3, | ||||
|     .texture_buffers = 0, | ||||
|     .image_buffers = 0, | ||||
|     .textures = 0, | ||||
|     .images = 0, | ||||
|     .score = 3, | ||||
| }; | ||||
| 
 | ||||
| constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ | ||||
|     { | ||||
|         .binding = ASTC_BINDING_INPUT_BUFFER, | ||||
| @ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT | ||||
|     .stride = sizeof(DescriptorUpdateEntry), | ||||
| }; | ||||
| 
 | ||||
| constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{ | ||||
|     .dstBinding = 0, | ||||
|     .dstArrayElement = 0, | ||||
|     .descriptorCount = 3, | ||||
|     .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||
|     .offset = 0, | ||||
|     .stride = sizeof(DescriptorUpdateEntry), | ||||
| }; | ||||
| 
 | ||||
| constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> | ||||
|     ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ | ||||
|         { | ||||
| @ -132,6 +176,11 @@ struct AstcPushConstants { | ||||
|     u32 block_height; | ||||
|     u32 block_height_mask; | ||||
| }; | ||||
| 
 | ||||
| struct QueriesPrefixScanPushConstants { | ||||
|     u32 max_accumulation_base; | ||||
|     u32 accumulation_limit; | ||||
| }; | ||||
| } // Anonymous namespace
 | ||||
| 
 | ||||
| ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, | ||||
| @ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass( | ||||
| 
 | ||||
| void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, | ||||
|                                               u32 src_offset, bool compare_to_zero) { | ||||
|     scheduler.RequestOutsideRenderPassOperationContext(); | ||||
| 
 | ||||
|     const size_t compare_size = compare_to_zero ? 8 : 24; | ||||
| 
 | ||||
|     compute_pass_descriptor_queue.Acquire(); | ||||
| @ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ | ||||
|         static constexpr VkMemoryBarrier read_barrier{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||
|             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||
|         }; | ||||
|         static constexpr VkMemoryBarrier write_barrier{ | ||||
| @ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_ | ||||
|     }); | ||||
| } | ||||
| 
 | ||||
| QueriesPrefixScanPass::QueriesPrefixScanPass( | ||||
|     const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_, | ||||
|     ComputePassDescriptorQueue& compute_pass_descriptor_queue_) | ||||
|     : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS, | ||||
|                   QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO, | ||||
|                   COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>, | ||||
|                   QUERIES_PREFIX_SCAN_SUM_COMP_SPV), | ||||
|       scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {} | ||||
| 
 | ||||
| void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, | ||||
|                                 VkBuffer src_buffer, size_t number_of_sums, | ||||
|                                 size_t max_accumulation_limit) { | ||||
|     size_t aligned_runs = Common::AlignUp(number_of_sums, 32); | ||||
| 
 | ||||
|     compute_pass_descriptor_queue.Acquire(); | ||||
|     compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64)); | ||||
|     compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64)); | ||||
|     compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64)); | ||||
|     const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()}; | ||||
| 
 | ||||
|     scheduler.RequestOutsideRenderPassOperationContext(); | ||||
|     scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums, | ||||
|                       aligned_runs](vk::CommandBuffer cmdbuf) { | ||||
|         static constexpr VkMemoryBarrier read_barrier{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, | ||||
|         }; | ||||
|         static constexpr VkMemoryBarrier write_barrier{ | ||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, | ||||
|             .pNext = nullptr, | ||||
|             .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, | ||||
|             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT | | ||||
|                              VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | | ||||
|                              VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT | | ||||
|                              VK_ACCESS_UNIFORM_READ_BIT | | ||||
|                              VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT, | ||||
|         }; | ||||
|         const QueriesPrefixScanPushConstants uniforms{ | ||||
|             .max_accumulation_base = static_cast<u32>(max_accumulation_limit), | ||||
|             .accumulation_limit = static_cast<u32>(number_of_sums - 1), | ||||
|         }; | ||||
|         const VkDescriptorSet set = descriptor_allocator.Commit(); | ||||
|         device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data); | ||||
| 
 | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, | ||||
|                                VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier); | ||||
|         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline); | ||||
|         cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {}); | ||||
|         cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms); | ||||
|         cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1); | ||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, | ||||
|                                VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier); | ||||
|     }); | ||||
| } | ||||
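The dispatch size follows directly from the workgroup width: number_of_sums is padded up to a multiple of 32, each buffer is bound for that padded range, and one workgroup is recorded per 32 sums. A worked example with hypothetical values, for illustration only:

    #include <cstddef>

    int main() {
        const std::size_t number_of_sums = 70;          // hypothetical
        const std::size_t max_accumulation_limit = 12;  // hypothetical
        const std::size_t aligned_runs = (number_of_sums + 31) / 32 * 32; // AlignUp(70, 32) == 96
        // Each src/dst binding covers aligned_runs * sizeof(u64) == 768 bytes,
        // the push constants become {max_accumulation_base = 12, accumulation_limit = 69},
        // and the recorded dispatch is cmdbuf.Dispatch(96 / 32, 1, 1), i.e. three workgroups.
        return 0;
    }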
| 
 | ||||
| ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | ||||
|                                  DescriptorPool& descriptor_pool_, | ||||
|                                  StagingBufferPool& staging_buffer_pool_, | ||||
|  | ||||
| @ -95,6 +95,20 @@ private: | ||||
|     ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||||
| }; | ||||
| 
 | ||||
| class QueriesPrefixScanPass final : public ComputePass { | ||||
| public: | ||||
|     explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_, | ||||
|                                    DescriptorPool& descriptor_pool_, | ||||
|                                    ComputePassDescriptorQueue& compute_pass_descriptor_queue_); | ||||
| 
 | ||||
|     void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer, | ||||
|              size_t number_of_sums, size_t max_accumulation_limit); | ||||
| 
 | ||||
| private: | ||||
|     Scheduler& scheduler; | ||||
|     ComputePassDescriptorQueue& compute_pass_descriptor_queue; | ||||
| }; | ||||
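A minimal usage sketch, taken from how SamplesStreamer wires the pass up later in this commit (identifiers are the ones introduced in the vk_query_cache changes below):

    queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
        device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
    // ...
    queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
                                  *buffers[resolve_buffer_index], num_slots_used,
                                  std::min(last_accumulation_checkpoint, num_slots_used));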
| 
 | ||||
| class ASTCDecoderPass final : public ComputePass { | ||||
| public: | ||||
|     explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, | ||||
|  | ||||
| @ -11,6 +11,7 @@ | ||||
| #include <utility> | ||||
| #include <vector> | ||||
| 
 | ||||
| #include "common/bit_util.h" | ||||
| #include "common/common_types.h" | ||||
| #include "core/memory.h" | ||||
| #include "video_core/engines/draw_manager.h" | ||||
| @ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer { | ||||
| public: | ||||
|     explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_, | ||||
|                              VideoCore::RasterizerInterface* rasterizer_, const Device& device_, | ||||
|                              Scheduler& scheduler_, const MemoryAllocator& memory_allocator_) | ||||
|                              Scheduler& scheduler_, const MemoryAllocator& memory_allocator_, | ||||
|                              ComputePassDescriptorQueue& compute_pass_descriptor_queue, | ||||
|                              DescriptorPool& descriptor_pool) | ||||
|         : BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_}, | ||||
|           scheduler{scheduler_}, memory_allocator{memory_allocator_} { | ||||
|         BuildResolveBuffer(); | ||||
|         current_bank = nullptr; | ||||
|         current_query = nullptr; | ||||
|         ammend_value = 0; | ||||
|         acumulation_value = 0; | ||||
|         queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>( | ||||
|             device, scheduler, descriptor_pool, compute_pass_descriptor_queue); | ||||
| 
 | ||||
|         const VkBufferCreateInfo buffer_ci = { | ||||
|             .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||||
|             .pNext = nullptr, | ||||
|             .flags = 0, | ||||
|             .size = 8, | ||||
|             .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | | ||||
|                      VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||||
|             .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||||
|             .queueFamilyIndexCount = 0, | ||||
|             .pQueueFamilyIndices = nullptr, | ||||
|         }; | ||||
|         accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal); | ||||
|         scheduler.RequestOutsideRenderPassOperationContext(); | ||||
|         scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { | ||||
|             cmdbuf.FillBuffer(buffer, 0, 8, 0); | ||||
|         }); | ||||
|     } | ||||
| 
 | ||||
|     ~SamplesStreamer() = default; | ||||
| @ -159,6 +180,8 @@ public: | ||||
|             acumulation_value = 0; | ||||
|         }); | ||||
|         rasterizer->SyncOperation(std::move(func)); | ||||
|         accumulation_since_last_sync = false; | ||||
|         last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used); | ||||
|     } | ||||
| 
 | ||||
|     void CloseCounter() override { | ||||
| @ -175,7 +198,8 @@ public: | ||||
|         } | ||||
| 
 | ||||
|         for (size_t i = 0; i < sync_values_stash.size(); i++) { | ||||
|             runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]); | ||||
|             runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], | ||||
|                                                         *buffers[resolve_buffers[i]]); | ||||
|         } | ||||
| 
 | ||||
|         sync_values_stash.clear(); | ||||
| @ -189,36 +213,21 @@ public: | ||||
|         sync_values_stash.clear(); | ||||
|         sync_values_stash.emplace_back(); | ||||
|         std::vector<HostSyncValues>* sync_values = &sync_values_stash.back(); | ||||
|         sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); | ||||
|         sync_values->reserve(num_slots_used); | ||||
|         std::unordered_map<size_t, std::pair<size_t, size_t>> offsets; | ||||
|         size_t this_bank_slot = std::numeric_limits<size_t>::max(); | ||||
|         size_t resolve_slots_remaining = resolve_slots; | ||||
|         size_t resolve_buffer_index = 0; | ||||
|         resolve_buffers.clear(); | ||||
|         size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used); | ||||
|         resolve_buffers.push_back(resolve_buffer_index); | ||||
|         size_t base_offset = 0; | ||||
| 
 | ||||
|         ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start, | ||||
|                                                  size_t amount) { | ||||
|             size_t bank_id = bank->GetIndex(); | ||||
|             if (this_bank_slot != bank_id) { | ||||
|                 this_bank_slot = bank_id; | ||||
|                 if (resolve_slots_remaining == 0) { | ||||
|                     resolve_buffer_index++; | ||||
|                     if (resolve_buffer_index >= resolve_buffers.size()) { | ||||
|                         BuildResolveBuffer(); | ||||
|                     } | ||||
|                     resolve_slots_remaining = resolve_slots; | ||||
|                     sync_values_stash.emplace_back(); | ||||
|                     sync_values = &sync_values_stash.back(); | ||||
|                     sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE); | ||||
|                 } | ||||
|                 resolve_slots_remaining--; | ||||
|             } | ||||
|             auto& resolve_buffer = resolve_buffers[resolve_buffer_index]; | ||||
|             const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * | ||||
|                                        (resolve_slots - resolve_slots_remaining - 1); | ||||
|             auto& resolve_buffer = buffers[resolve_buffer_index]; | ||||
|             VkQueryPool query_pool = bank->GetInnerPool(); | ||||
|             scheduler.RequestOutsideRenderPassOperationContext(); | ||||
|             scheduler.Record([start, amount, base_offset, query_pool, | ||||
|                               buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) { | ||||
|                 size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE; | ||||
|                 const VkBufferMemoryBarrier copy_query_pool_barrier{ | ||||
|                     .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, | ||||
|                     .pNext = nullptr, | ||||
| @ -227,39 +236,60 @@ public: | ||||
|                     .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                     .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, | ||||
|                     .buffer = buffer, | ||||
|                     .offset = final_offset, | ||||
|                     .offset = base_offset, | ||||
|                     .size = amount * SamplesQueryBank::QUERY_SIZE, | ||||
|                 }; | ||||
| 
 | ||||
|                 cmdbuf.CopyQueryPoolResults( | ||||
|                     query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer, | ||||
|                     static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE, | ||||
|                     static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE, | ||||
|                     VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT); | ||||
|                 cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, | ||||
|                                        VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier); | ||||
|             }); | ||||
|             offsets[bank_id] = {sync_values_stash.size() - 1, base_offset}; | ||||
|             offsets[bank_id] = {start, base_offset}; | ||||
|             base_offset += amount * SamplesQueryBank::QUERY_SIZE; | ||||
|         }); | ||||
| 
 | ||||
|         // Convert queries
 | ||||
|         bool has_multi_queries = false; | ||||
|         for (auto q : pending_sync) { | ||||
|             auto* query = GetQuery(q); | ||||
|             size_t sync_value_slot = 0; | ||||
|             if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) { | ||||
|                 continue; | ||||
|             } | ||||
|             if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) { | ||||
|                 continue; | ||||
|             } | ||||
|             if (query->size_slots > 1) { | ||||
|                 // This is problematic.
 | ||||
|                 // UNIMPLEMENTED();
 | ||||
|             if (accumulation_since_last_sync || query->size_slots > 1) { | ||||
|                 if (!has_multi_queries) { | ||||
|                     has_multi_queries = true; | ||||
|                     sync_values_stash.emplace_back(); | ||||
|                 } | ||||
|                 sync_value_slot = 1; | ||||
|             } | ||||
|             query->flags |= VideoCommon::QueryFlagBits::IsHostSynced; | ||||
|             auto loc_data = offsets[query->start_bank_id]; | ||||
|             sync_values_stash[loc_data.first].emplace_back(HostSyncValues{ | ||||
|             sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{ | ||||
|                 .address = query->guest_address, | ||||
|                 .size = SamplesQueryBank::QUERY_SIZE, | ||||
|                 .offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE, | ||||
|                 .offset = | ||||
|                     loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) * | ||||
|                                           SamplesQueryBank::QUERY_SIZE, | ||||
|             }); | ||||
|         } | ||||
| 
 | ||||
|         if (has_multi_queries) { | ||||
|             size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used); | ||||
|             resolve_buffers.push_back(intermediary_buffer_index); | ||||
|             queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index], | ||||
|                                           *buffers[resolve_buffer_index], num_slots_used, | ||||
|                                           std::min(last_accumulation_checkpoint, num_slots_used)); | ||||
|         } else { | ||||
|             scheduler.RequestOutsideRenderPassOperationContext(); | ||||
|             scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) { | ||||
|                 cmdbuf.FillBuffer(buffer, 0, 8, 0); | ||||
|             }); | ||||
|         } | ||||
| 
 | ||||
| @ -267,6 +297,9 @@ public: | ||||
|         std::function<void()> func([this] { ammend_value = acumulation_value; }); | ||||
|         rasterizer->SyncOperation(std::move(func)); | ||||
|         AbandonCurrentQuery(); | ||||
|         num_slots_used = 0; | ||||
|         last_accumulation_checkpoint = std::numeric_limits<size_t>::max(); | ||||
|         accumulation_since_last_sync = has_multi_queries; | ||||
|         pending_sync.clear(); | ||||
|     } | ||||
| 
 | ||||
| @ -400,6 +433,7 @@ private: | ||||
|     void ReserveHostQuery() { | ||||
|         size_t new_slot = ReserveBankSlot(); | ||||
|         current_bank->AddReference(1); | ||||
|         num_slots_used++; | ||||
|         if (current_query) { | ||||
|             size_t bank_id = current_query->start_bank_id; | ||||
|             size_t banks_set = current_query->size_banks - 1; | ||||
| @ -470,32 +504,50 @@ private: | ||||
|         }); | ||||
|     } | ||||
| 
 | ||||
|     void BuildResolveBuffer() { | ||||
|     template <bool is_resolve> | ||||
|     size_t ObtainBuffer(size_t num_needed) { | ||||
|         const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed)); | ||||
|         if constexpr (is_resolve) { | ||||
|             if (resolve_table[log_2] != 0) { | ||||
|                 return resolve_table[log_2] - 1; | ||||
|             } | ||||
|         } else { | ||||
|             if (intermediary_table[log_2] != 0) { | ||||
|                 return intermediary_table[log_2] - 1; | ||||
|             } | ||||
|         } | ||||
|         const VkBufferCreateInfo buffer_ci = { | ||||
|             .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, | ||||
|             .pNext = nullptr, | ||||
|             .flags = 0, | ||||
|             .size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots, | ||||
|             .size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2), | ||||
|             .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | | ||||
|                      VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||||
|             .sharingMode = VK_SHARING_MODE_EXCLUSIVE, | ||||
|             .queueFamilyIndexCount = 0, | ||||
|             .pQueueFamilyIndices = nullptr, | ||||
|         }; | ||||
|         resolve_buffers.emplace_back( | ||||
|             memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); | ||||
|         buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal)); | ||||
|         if constexpr (is_resolve) { | ||||
|             resolve_table[log_2] = buffers.size(); | ||||
|         } else { | ||||
|             intermediary_table[log_2] = buffers.size(); | ||||
|         } | ||||
|         return buffers.size() - 1; | ||||
|     } | ||||
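ObtainBuffer caches one buffer per power-of-two size class (with a 64-slot minimum), separately for resolve and intermediary buffers, so repeated syncs with similar query counts reuse earlier allocations instead of creating new ones. A sketch of the bucketing arithmetic, assuming Log2Ceil64 rounds up to the next power of two and 8 bytes per sample counter (VK_QUERY_RESULT_64_BIT); values are hypothetical:

    #include <algorithm>
    #include <bit>
    #include <cstddef>

    int main() {
        constexpr std::size_t query_size = 8;  // assumed size of one 64-bit sample counter
        const std::size_t num_needed = 100;    // hypothetical slot count for this sync
        const std::size_t log_2 = std::max<std::size_t>(
            6, static_cast<std::size_t>(std::bit_width(num_needed - 1))); // Log2Ceil64(100) == 7
        const std::size_t capacity = std::size_t{1} << log_2; // 128 slots
        const std::size_t byte_size = query_size * capacity;  // 1024-byte VkBuffer
        // The new buffer's index + 1 is stored in resolve_table[7] (or intermediary_table[7]),
        // so any later request needing up to 128 slots returns the same buffer.
        return static_cast<int>(byte_size & 0);
    }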
| 
 | ||||
|     static constexpr size_t resolve_slots = 8; | ||||
| 
 | ||||
|     QueryCacheRuntime& runtime; | ||||
|     VideoCore::RasterizerInterface* rasterizer; | ||||
|     const Device& device; | ||||
|     Scheduler& scheduler; | ||||
|     const MemoryAllocator& memory_allocator; | ||||
|     VideoCommon::BankPool<SamplesQueryBank> bank_pool; | ||||
|     std::deque<vk::Buffer> resolve_buffers; | ||||
|     std::deque<vk::Buffer> buffers; | ||||
|     std::array<size_t, 32> resolve_table{}; | ||||
|     std::array<size_t, 32> intermediary_table{}; | ||||
|     vk::Buffer accumulation_buffer; | ||||
|     std::deque<std::vector<HostSyncValues>> sync_values_stash; | ||||
|     std::vector<size_t> resolve_buffers; | ||||
| 
 | ||||
|     // syncing queue
 | ||||
|     std::vector<size_t> pending_sync; | ||||
| @ -510,10 +562,14 @@ private: | ||||
|     SamplesQueryBank* current_bank; | ||||
|     VkQueryPool current_query_pool; | ||||
|     size_t current_query_id; | ||||
|     size_t num_slots_used{}; | ||||
|     size_t last_accumulation_checkpoint{}; | ||||
|     bool accumulation_since_last_sync{}; | ||||
|     VideoCommon::HostQueryBase* current_query; | ||||
|     bool has_started{}; | ||||
|     bool current_unset{}; | ||||
|     std::mutex flush_guard; | ||||
| 
 | ||||
|     std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass; | ||||
| }; | ||||
| 
 | ||||
| // Transform feedback queries
 | ||||
| @ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl { | ||||
|           memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, | ||||
|           guest_streamer(0, runtime), | ||||
|           sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer, | ||||
|                           device, scheduler, memory_allocator), | ||||
|                           device, scheduler, memory_allocator, compute_pass_descriptor_queue, | ||||
|                           descriptor_pool), | ||||
|           tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device, | ||||
|                        scheduler, memory_allocator, staging_pool), | ||||
|           primitives_succeeded_streamer( | ||||
| @ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku | ||||
|             return true; | ||||
|         } | ||||
|     } | ||||
|     if (!is_in_bc[0] && !is_in_bc[1]) { | ||||
|     /*if (!is_in_bc[0] && !is_in_bc[1]) {
 | ||||
|         // Both queries are in query cache, it's best to just flush.
 | ||||
|         return false; | ||||
|     } | ||||
|         return true; | ||||
|     }*/ | ||||
|     HostConditionalRenderingCompareBCImpl(object_1.address, equal_check); | ||||
|     return true; | ||||
| } | ||||
|  | ||||