Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

video_core: added support for indirect draws #678

Merged
merged 2 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/core/libraries/gnmdriver/gnmdriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,12 +650,12 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32
}

s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
u32 vertex_sgpr_offset, u32 instance_vgpr_offset,
u32 vertex_sgpr_offset, u32 instance_sgpr_offset,
u32 flags) {
LOG_TRACE(Lib_GnmDriver, "called");

if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
(vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) {
(vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) {

const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndexIndirect>(
Expand All @@ -665,7 +665,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset,

cmdbuf[0] = data_offset;
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[3] = 0;

cmdbuf += 4;
Expand Down Expand Up @@ -707,11 +707,11 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset,
}

s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags) {
u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags) {
LOG_TRACE(Lib_GnmDriver, "called");

if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) &&
(vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) {
(vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) {

const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable;
cmdbuf = WriteHeader<PM4ItOpcode::DrawIndirect>(cmdbuf, 4, PM4ShaderType::ShaderGraphics,
Expand All @@ -721,7 +721,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32

cmdbuf[0] = data_offset;
cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset;
cmdbuf[3] = 2; // auto index

cmdbuf += 4;
Expand Down
4 changes: 2 additions & 2 deletions src/core/libraries/gnmdriver/gnmdriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ s32 PS4_SYSV_ABI sceGnmDrawIndex(u32* cmdbuf, u32 size, u32 index_count, uintptr
u32 flags, u32 type);
s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 flags);
s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
u32 vertex_sgpr_offset, u32 instance_vgpr_offset,
u32 vertex_sgpr_offset, u32 instance_sgpr_offset,
u32 flags);
int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti();
int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti();
int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced();
s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count,
u32 flags);
s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage,
u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags);
u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags);
int PS4_SYSV_ABI sceGnmDrawIndirectCountMulti();
int PS4_SYSV_ABI sceGnmDrawIndirectMulti();
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size);
Expand Down
31 changes: 31 additions & 0 deletions src/video_core/amdgpu/liverpool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,36 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
break;
}
case PM4ItOpcode::DrawIndirect: {
    // Non-indexed indirect draw: the packet only carries an offset, the draw arguments
    // themselves live in the indirect-args buffer registered for the graphics queue.
    const auto* packet = reinterpret_cast<const PM4CmdDrawIndirect*>(header);
    const auto args_offset = packet->data_offset;
    const auto args_base = mapped_queues[GfxQueueId].indirect_args_addr;
    const auto args_size = sizeof(PM4CmdDrawIndirect::DrawInstancedArgs);
    if (!rasterizer) {
        break;
    }

    // The packet address doubles as the debug label and as the breadcrumb value.
    const auto packet_ptr = reinterpret_cast<const void*>(header);
    rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", packet_ptr));
    rasterizer->Breadcrumb(u64(packet_ptr));
    rasterizer->DrawIndirect(false, args_base, args_offset, args_size);
    rasterizer->ScopeMarkerEnd();
    break;
}
case PM4ItOpcode::DrawIndexIndirect: {
    // Indexed indirect draw: the packet only carries an offset, the draw arguments
    // themselves live in the indirect-args buffer registered for the graphics queue.
    const auto* packet = reinterpret_cast<const PM4CmdDrawIndexIndirect*>(header);
    const auto args_offset = packet->data_offset;
    const auto args_base = mapped_queues[GfxQueueId].indirect_args_addr;
    const auto args_size = sizeof(PM4CmdDrawIndexIndirect::DrawIndexInstancedArgs);
    if (!rasterizer) {
        break;
    }

    // The packet address doubles as the debug label and as the breadcrumb value.
    const auto packet_ptr = reinterpret_cast<const void*>(header);
    rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndexIndirect", packet_ptr));
    rasterizer->Breadcrumb(u64(packet_ptr));
    rasterizer->DrawIndirect(true, args_base, args_offset, args_size);
    rasterizer->ScopeMarkerEnd();
    break;
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
regs.cs_program.dim_x = dispatch_direct->dim_x;
Expand Down Expand Up @@ -488,6 +518,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
break;
}
case PM4ItOpcode::PfpSyncMe: {
    // The rasterizer pointer is null-checked by the draw cases of this switch, so it
    // may be absent here as well; only forward the sync when a rasterizer exists.
    if (rasterizer) {
        rasterizer->CpSync();
    }
    break;
}
default:
Expand Down
61 changes: 47 additions & 14 deletions src/video_core/amdgpu/pm4_cmds.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,20 +253,6 @@ struct PM4CmdDrawIndexAuto {
u32 draw_initiator;
};

/// Packet body for PM4ItOpcode::DrawIndirect: a type-3 header followed by four payload dwords
/// (data offset, two 16-bit location fields and the draw initiator).
struct PM4CmdDrawIndirect {
PM4Type3Header header; ///< header
u32 data_offset; ///< DWORD aligned offset
union {
u32 dw2; ///< raw view of the dword holding base_vtx_loc
BitField<0, 16, u32> base_vtx_loc; ///< base vertex location
};
union {
u32 dw3; ///< raw view of the dword holding start_inst_loc
BitField<0, 16, u32> start_inst_loc; ///< start instance location
};
u32 draw_initiator; ///< Draw Initiator Register
};

enum class DataSelect : u32 {
None = 0,
Data32Low = 1,
Expand Down Expand Up @@ -740,4 +726,51 @@ struct PM4CmdDispatchIndirect {
u32 dispatch_initiator; ///< Dispatch Initiator Register
};

/// Packet body for PM4ItOpcode::DrawIndirect (non-indexed indirect draw): a type-3 header
/// followed by four payload dwords.
struct PM4CmdDrawIndirect {
/// Argument record located at data_offset inside the indirect-args buffer. Its sizeof() is
/// the size the command processor emulation requests when issuing the draw.
/// NOTE(review): field order looks like it mirrors Vulkan's VkDrawIndirectCommand, which the
/// rasterizer relies on when it passes the buffer to drawIndirect — confirm before reordering.
struct DrawInstancedArgs {
u32 vertex_count_per_instance;
u32 instance_count;
u32 start_vertex_location;
u32 start_instance_location;
};

PM4Type3Header header; ///< header
u32 data_offset; ///< Byte aligned offset where the required data structure starts
union {
u32 dw2; ///< raw view of the dword holding base_vtx_loc
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
///< BaseVertexLocation it fetched from memory
};
union {
u32 dw3; ///< raw view of the dword holding start_inst_loc
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
///< StartInstanceLocation it fetched from memory
};
u32 draw_initiator; ///< Draw Initiator Register
};

/// Packet body for PM4ItOpcode::DrawIndexIndirect (indexed indirect draw): a type-3 header
/// followed by four payload dwords.
struct PM4CmdDrawIndexIndirect {
/// Argument record located at data_offset inside the indirect-args buffer. Its sizeof() is
/// the size the command processor emulation requests when issuing the draw.
/// NOTE(review): field order looks like it mirrors Vulkan's VkDrawIndexedIndirectCommand,
/// which the rasterizer relies on when it passes the buffer to drawIndexedIndirect — confirm
/// before reordering.
struct DrawIndexInstancedArgs {
u32 index_count_per_instance;
u32 instance_count;
u32 start_index_location;
u32 base_vertex_location;
u32 start_instance_location;
};

PM4Type3Header header; ///< header
u32 data_offset; ///< Byte aligned offset where the required data structure starts
union {
u32 dw2; ///< raw view of the dword holding base_vtx_loc
BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the
///< BaseVertexLocation it fetched from memory
};
union { // NOTE: this one is undocumented in AMD spec, but Gnm driver writes this field
u32 dw3; ///< raw view of the dword holding start_inst_loc
BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the
///< StartInstanceLocation it fetched from memory
};
u32 draw_initiator; ///< Draw Initiator Register
};

} // namespace AmdGpu
65 changes: 52 additions & 13 deletions src/video_core/renderer_vulkan/vk_rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,19 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,

Rasterizer::~Rasterizer() = default;

void Rasterizer::CpSync() {
scheduler.EndRendering();
auto cmdbuf = scheduler.CommandBuffer();

const vk::MemoryBarrier ib_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eDrawIndirect,
vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {});
}

void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
RENDERER_TRACE;

Expand Down Expand Up @@ -66,6 +79,45 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) {
}
}

/// Records one indirect draw whose arguments are read from a buffer instead of registers.
///
/// Does nothing when no graphics pipeline is available for the current state.
///
/// @param is_indexed Selects drawIndexedIndirect (true) or drawIndirect (false).
/// @param address    Base address of the indirect-arguments buffer.
/// @param offset     Offset of the argument record, added to the buffer-relative base.
/// @param size       Size in bytes of the argument record requested from the buffer cache.
void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size) {
    RENDERER_TRACE;

    const auto cmdbuf = scheduler.CommandBuffer();
    const auto& regs = liverpool->regs;
    const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline();
    if (!pipeline) {
        return;
    }

    ASSERT_MSG(regs.primitive_type != AmdGpu::Liverpool::PrimitiveType::RectList,
               "Unsupported primitive type for indirect draw");

    try {
        pipeline->BindResources(regs, buffer_cache, texture_cache);
    } catch (...) {
        UNREACHABLE();
    }

    const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex);
    buffer_cache.BindVertexBuffers(vs_info);
    // Only the binding side effect is needed here: the index count comes from the indirect
    // arguments, so the value returned by BindIndexBuffer is intentionally discarded.
    buffer_cache.BindIndexBuffer(is_indexed, 0);

    BeginRendering();
    UpdateDynamicState(*pipeline);

    // NOTE(review): the cache is asked for [address, address + size) while the arguments are
    // read at base + offset — confirm ObtainBuffer guarantees coverage of the offset range.
    const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
    const auto total_offset = base + offset;

    // We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and
    // instance offsets will be automatically applied by Vulkan from indirect args buffer.

    // A single draw is issued (draw count 1), so the stride argument is passed as 0.
    if (is_indexed) {
        cmdbuf.drawIndexedIndirect(buffer->Handle(), total_offset, 1, 0);
    } else {
        cmdbuf.drawIndirect(buffer->Handle(), total_offset, 1, 0);
    }
}

void Rasterizer::DispatchDirect() {
RENDERER_TRACE;

Expand Down Expand Up @@ -113,19 +165,6 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle());
const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true);
const auto total_offset = base + offset;

// Emulate PFP-to-ME sync packet
const vk::BufferMemoryBarrier ib_barrier{
.srcAccessMask = vk::AccessFlagBits::eShaderWrite,
.dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead,
.buffer = buffer->Handle(),
.offset = total_offset,
.size = size,
};
cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
vk::PipelineStageFlagBits::eDrawIndirect,
vk::DependencyFlagBits::eByRegion, {}, ib_barrier, {});

cmdbuf.dispatchIndirect(buffer->Handle(), total_offset);
}

Expand Down
2 changes: 2 additions & 0 deletions src/video_core/renderer_vulkan/vk_rasterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Rasterizer {
}

void Draw(bool is_indexed, u32 index_offset = 0);
void DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size);

void DispatchDirect();
void DispatchIndirect(VAddr address, u32 offset, u32 size);
Expand All @@ -45,6 +46,7 @@ class Rasterizer {
void MapMemory(VAddr addr, u64 size);
void UnmapMemory(VAddr addr, u64 size);

void CpSync();
u64 Flush();

private:
Expand Down
Loading