Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions 73_ImageUploadBenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Load the shared `common` CMake module. With RESULT_VARIABLE, include() stores the
# full path of the included file in RES, or NOTFOUND when the module cannot be located.
include(common RESULT_VARIABLE RES)
if(NOT RES)
# Stop configuring right away: everything below calls macros that this file does not define itself.
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create this example's executable through the project macro.
# NOTE(review): the four empty positional arguments and the PCH-target argument are interpreted
# by the macro, which is not visible here; EXECUTABLE_NAME used further down is presumably
# defined by this call - confirm against the macro definition.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optional step, taken only when NBL_EMBED_BUILTIN_RESOURCES is truthy: collect every file
# under app_resources/ and pass the list to the project's builtin-resource macros.
if(NBL_EMBED_BUILTIN_RESOURCES)
# Name of the helper target that carries the resource data for this executable.
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute paths: the source dir to search in, plus where generated sources/headers are placed.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# RELATIVE yields paths relative to the resource directory; CONFIGURE_DEPENDS makes the
# build re-check the glob so added or removed resource files trigger a reconfigure.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
# Register each discovered file in the RESOURCES_TO_EMBED list.
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# NOTE(review): presumably generates the embedding code under the "nbl::this_example::builtin"
# namespace into the header/source output directories above - confirm against the macro.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Attach the resource target to the executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()

# Directory under the build tree that is passed below as BINARY_DIR / BIND for the shader build.
set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
# Shader sources of this example; the same list is passed as DEPENDS to the compile rules below.
set(DEPENDS
app_resources/common.hlsl
app_resources/tile_upload.comp.hlsl
)
# Attach the shaders to the executable target, but mark them HEADER_FILE_ONLY so the
# native toolchain never tries to compile them as source files.
target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)

# Shader model suffix; expanded into the `-T lib_${SM}` compiler option below.
set(SM 6_8)
# Compile-input description, written as a bracket argument so the JSON quotes need no escaping.
# It lists a single input file, app_resources/tile_upload.comp.hlsl, under the key "snakeStore".
set(JSON [=[
[
{
"INPUT": "app_resources/tile_upload.comp.hlsl",
"KEY": "snakeStore"
}
]
]=])
# Bracket arguments are not variable-expanded, so string(CONFIGURE) is what would substitute
# any ${VAR}/@VAR@ references in the JSON text (the text above currently contains none).
string(CONFIGURE "${JSON}" JSON)

# Declare the shader compilation rules for the inputs described by JSON.
# NOTE(review): argument semantics are defined by the project macro, which is not visible here.
# Presumably it creates the ${EXECUTABLE_NAME}SPIRV target, writes outputs under BINARY_DIR,
# generates the keys.hpp header named by INCLUDE, and returns the produced keys in KEYS - confirm.
NBL_CREATE_NSC_COMPILE_RULES(
TARGET ${EXECUTABLE_NAME}SPIRV
LINK_TO ${EXECUTABLE_NAME}
DEPENDS ${DEPENDS}
BINARY_DIR ${OUTPUT_DIRECTORY}
MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM}
OUTPUT_VAR KEYS
INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
NAMESPACE nbl::this_example::builtin::build
INPUTS ${JSON}
)

# Package the compiled shader outputs (the KEYS produced by the compile rules) found under
# OUTPUT_DIRECTORY into a resource archive target linked to the executable.
# NOTE(review): the exact meaning of BIND/BUILTINS is defined by the project macro - confirm.
# The namespace matches the one given to NBL_CREATE_NSC_COMPILE_RULES.
NBL_CREATE_RESOURCE_ARCHIVE(
NAMESPACE nbl::this_example::builtin::build
TARGET ${EXECUTABLE_NAME}_builtinsBuild
LINK_TO ${EXECUTABLE_NAME}
BIND ${OUTPUT_DIRECTORY}
BUILTINS ${KEYS}
)
10 changes: 10 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#include <nbl/builtin/hlsl/morton.hlsl>

// Push-constant block consumed by the compute entry points in tile_upload.comp.hlsl,
// where it is declared as `pc`.
// NOTE(review): the host side presumably fills an identically laid-out struct - confirm
// before reordering, resizing or removing any field.
struct PushConstantData
{
// 64-bit base address the shaders pass to vk::RawBufferLoad / vk::RawBufferStore;
// texel i lives at deviceBufferAddress + i * 4 bytes.
uint64_t deviceBufferAddress;
// NOTE(review): not read by any entry point visible in tile_upload.comp.hlsl;
// presumably a destination offset in texels - confirm or remove.
uint32_t2 dstOffset;
// NOTE(review): srcWidth / srcHeight are likewise not read by the visible entry points,
// so no bounds check is derived from them - confirm intended use.
uint32_t srcWidth;
uint32_t srcHeight;
// Number of tiles in one row of the tile grid; used to turn a linear tile index
// into a (column, row) tile coordinate.
uint32_t tilesPerRow;
};
72 changes: 72 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include "common.hlsl"

// Storage image at binding 0 of descriptor set 0: written by the *Store entry points
// and read back by the *Load entry points.
[[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage;
// Per-dispatch parameters (buffer address, tile grid width, ...), see PushConstantData.
[[vk::push_constant]] PushConstantData pc;

// Lets the entry points below name morton::code (and presumably the pack/unpack helpers)
// without the nbl::hlsl:: prefix.
using namespace nbl::hlsl;

// Edge length, in texels, of the square tiles the entry points address.
static const uint32_t TILE_SIZE = 128u;

[numthreads(128, 1, 1)]
Copy link
Copy Markdown
Contributor

@Erfan-Ahmadi Erfan-Ahmadi Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • try 2D workgroups.
  • make the workgroup size 128x4 (make sure this is reflected in your dispatch) --> 128*4=512 is a good workgroup size; exactly 3 such workgroups fit on modern SMs
  • pixelPos will be just the global thread idx. we need to have individual offsets for each tile requests
  • your tile size is a power of two (PoT), so you can replace the division with a right shift by 7 (>> TILE_SIZE_LOG2); for the modulo use &127u, i.e. &(TILE_SIZE-1)
  • tileIdx will be globalPos.xy >> 7 (define TILE_SIZE_LOG2)
  • localPos will be globalPos.xy&127
  • your start read location will be tileIdx*TILE_SIZE*TILE_SIZE, or tileIdx << 14u
  • you get it :D I won't continue

[shader("compute")]
// Buffer -> image copy, one texel per invocation.
//
// Invocation `ID.x` reads the 32-bit packed texel number `ID.x` from the buffer at
// pc.deviceBufferAddress, unpacks it with unpackUnorm4x8 and writes it to dstImage.
// Each run of TILE_SIZE*TILE_SIZE consecutive indices fills one tile; tiles are laid out
// left-to-right, top-to-bottom with pc.tilesPerRow tiles per row, and texels inside a
// tile are visited row by row.
//
// No bounds check is performed here: the dispatch size must match the buffer and image.
void SnakeStore(uint32_t3 ID : SV_DispatchThreadID)
{
    // Flat invocation index; doubles as the index of the texel read from the buffer.
    const uint32_t gIdx = ID.x;

    // Split the flat index into "which tile" and "which texel inside that tile".
    const uint32_t texelsPerTile = TILE_SIZE * TILE_SIZE;
    const uint32_t tileIdx = gIdx / texelsPerTile;
    const uint32_t localIdx = gIdx % texelsPerTile;

    // Top-left texel of the tile, then the row-major position inside the tile.
    const uint32_t2 tileBase = uint32_t2(tileIdx % pc.tilesPerRow, tileIdx / pc.tilesPerRow) * TILE_SIZE;
    const uint32_t2 localPos = uint32_t2(localIdx % TILE_SIZE, localIdx / TILE_SIZE);
    const uint32_t2 pixelPos = tileBase + localPos;

    // Widen before scaling: a 32-bit `gIdx * 4u` wraps once gIdx reaches 2^30 and would
    // silently alias the beginning of the buffer instead of addressing past 4 GiB.
    const uint64_t texelAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;

    const uint32_t packed = vk::RawBufferLoad<uint32_t>(texelAddress);
    dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed));
}

// Image -> buffer copy, one texel per invocation (the inverse of SnakeStore).
//
// Invocation `ID.x` reads dstImage at the texel position derived from its flat index,
// packs it with packUnorm4x8 and stores the 32-bit result as element `ID.x` of the
// buffer at pc.deviceBufferAddress. Tiles are TILE_SIZE x TILE_SIZE, laid out with
// pc.tilesPerRow tiles per row; texels inside a tile are visited row by row.
//
// No bounds check is performed here: the dispatch size must match the buffer and image.
[numthreads(128, 1, 1)]
[shader("compute")]
void SnakeLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    // Flat invocation index; doubles as the index of the texel written to the buffer.
    const uint32_t gIdx = ID.x;

    // Split the flat index into "which tile" and "which texel inside that tile".
    const uint32_t texelsPerTile = TILE_SIZE * TILE_SIZE;
    const uint32_t tileIdx = gIdx / texelsPerTile;
    const uint32_t localIdx = gIdx % texelsPerTile;

    // Top-left texel of the tile, then the row-major position inside the tile.
    const uint32_t2 tileBase = uint32_t2(tileIdx % pc.tilesPerRow, tileIdx / pc.tilesPerRow) * TILE_SIZE;
    const uint32_t2 localPos = uint32_t2(localIdx % TILE_SIZE, localIdx / TILE_SIZE);
    const uint32_t2 pixelPos = tileBase + localPos;

    // Widen before scaling: a 32-bit `gIdx * 4u` wraps once gIdx reaches 2^30 and would
    // silently overwrite the beginning of the buffer instead of addressing past 4 GiB.
    const uint64_t texelAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;

    vk::RawBufferStore<uint32_t>(texelAddress, uint32_t(packUnorm4x8(dstImage[pixelPos])));
}

[numthreads(128, 1, 1)]
[shader("compute")]
void MortonStore(uint32_t3 ID : SV_DispatchThreadID)
Copy link
Copy Markdown
Contributor

@Erfan-Ahmadi Erfan-Ahmadi Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • make the workgroup 2d 16x16 or a 512 1D workgroup
  • but it needs to handle 16x16 region of the tile
  • your read pos stays the same with flattened global idx (we need to make sure you're reading contiguously)
  • morton::code<false, 7, 2> mc; you now only need 4 bits for 16x16 so it becomes 4,2 I think
  • use the bitshift and & for division and modulo like my prev commit.
    • it's very likely the compiler already does this optimization for you since TILE_SIZE is a macro, but it's good practice, in case it changed later to a push constant or something not known at compile time
  • make sure this change is reflected on your dispatch;
    • since each tile is 128x128, it'll take 64 workgroups of size 512 to handle copy of a single tile for you
    • use morton code locally within this 16x16 to figure out the write location + add offset.
    • doing morton on 16x16 tiles with added offset is no different than doing morton globally on a 128x128 tile (see image below)

Morton example for a 16x16 group:
thread 0: reads(0,0) at location 0 writes to pixelPos(0,0)
thread 1: reads(1,0) at location 1*ByteSize writes to pixelPos(1,0)
thread 2: reads(0,1) at location 2*ByteSize writes to pixelPos(0,1)
thread 3: reads(1,1) at location 3*ByteSize writes to pixelPos(1,1)
thread 4: reads(2,0) at location 4*ByteSize writes to pixelPos(2,0)
...
make sure this is what happens: reads are contiguous, writes are Morton-ordered
might be actually easier to achieve this with single 1D 512 workgroup, not sure

Image

Copy link
Copy Markdown
Contributor

@Erfan-Ahmadi Erfan-Ahmadi Apr 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw we're not going to go with morton benchmarking any further, but let's just fix it up and make it work as we first intended.

{
    // Buffer -> image copy, one texel per invocation: reads stay linear in the flat
    // invocation index, while the write position inside each TILE_SIZE x TILE_SIZE tile
    // is obtained by decoding the in-tile index as a Morton code.
    // No bounds check is performed here: the dispatch size must match the buffer and image.

    // Flat invocation index; doubles as the index of the texel read from the buffer.
    const uint32_t gIdx = ID.x;

    // Split the flat index into "which tile" and "which texel inside that tile".
    const uint32_t texelsPerTile = TILE_SIZE * TILE_SIZE;
    const uint32_t tileIdx = gIdx / texelsPerTile;
    const uint32_t localIdx = gIdx % texelsPerTile;
    const uint32_t2 tileBase = uint32_t2(tileIdx % pc.tilesPerRow, tileIdx / pc.tilesPerRow) * TILE_SIZE;

    // localIdx is below 128*128, so it fits the 16-bit value assigned here.
    // NOTE(review): template arguments are presumably <signed, bits per axis, axes>,
    // i.e. a 2D code with 7 bits per coordinate - confirm against morton.hlsl.
    morton::code<false, 7, 2> mc;
    mc.value = uint16_t(localIdx);
    const uint32_t2 localPos = _static_cast<uint32_t2>(mc);
    const uint32_t2 pixelPos = tileBase + localPos;

    // Widen before scaling: a 32-bit `gIdx * 4u` wraps once gIdx reaches 2^30 and would
    // silently alias the beginning of the buffer instead of addressing past 4 GiB.
    const uint64_t texelAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;

    const uint32_t packed = vk::RawBufferLoad<uint32_t>(texelAddress);
    dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed));
}

// Image -> buffer copy, one texel per invocation (the inverse of MortonStore).
//
// Buffer writes stay linear in the flat invocation index, while the image read position
// inside each TILE_SIZE x TILE_SIZE tile is obtained by decoding the in-tile index as a
// Morton code. Tiles are laid out with pc.tilesPerRow tiles per row.
//
// No bounds check is performed here: the dispatch size must match the buffer and image.
[numthreads(128, 1, 1)]
[shader("compute")]
void MortonLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    // Flat invocation index; doubles as the index of the texel written to the buffer.
    const uint32_t gIdx = ID.x;

    // Split the flat index into "which tile" and "which texel inside that tile".
    const uint32_t texelsPerTile = TILE_SIZE * TILE_SIZE;
    const uint32_t tileIdx = gIdx / texelsPerTile;
    const uint32_t localIdx = gIdx % texelsPerTile;
    const uint32_t2 tileBase = uint32_t2(tileIdx % pc.tilesPerRow, tileIdx / pc.tilesPerRow) * TILE_SIZE;

    // localIdx is below 128*128, so it fits the 16-bit value assigned here.
    // NOTE(review): template arguments are presumably <signed, bits per axis, axes>,
    // i.e. a 2D code with 7 bits per coordinate - confirm against morton.hlsl.
    morton::code<false, 7, 2> mc;
    mc.value = uint16_t(localIdx);
    const uint32_t2 localPos = _static_cast<uint32_t2>(mc);
    const uint32_t2 pixelPos = tileBase + localPos;

    // Widen before scaling: a 32-bit `gIdx * 4u` wraps once gIdx reaches 2^30 and would
    // silently overwrite the beginning of the buffer instead of addressing past 4 GiB.
    const uint64_t texelAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;

    vk::RawBufferStore<uint32_t>(texelAddress, uint32_t(packUnorm4x8(dstImage[pixelPos])));
}
28 changes: 28 additions & 0 deletions 73_ImageUploadBenchmark/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
Loading