From 37b6859865d3699ef7ad7ae2dfa2009a6cac9828 Mon Sep 17 00:00:00 2001
From: zhuanghb3
Date: Thu, 25 Sep 2025 02:18:18 +0000
Subject: [PATCH 1/3] Enable Vulkan support for ASTC textures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 patchForAndroid/external-mesa-0002.patch | 6992 +++++++++++++++++-----
 1 file changed, 5460 insertions(+), 1532 deletions(-)

diff --git a/patchForAndroid/external-mesa-0002.patch b/patchForAndroid/external-mesa-0002.patch
index bd9c3bf..50563ea 100644
--- a/patchForAndroid/external-mesa-0002.patch
+++ b/patchForAndroid/external-mesa-0002.patch
@@ -1,456 +1,1343 @@
+diff --git a/include/vulkan/vulkan_core.h b/include/vulkan/vulkan_core.h
+index 5c8b8461f1c..ca180aa5bdc 100644
+--- a/include/vulkan/vulkan_core.h
++++ b/include/vulkan/vulkan_core.h
+@@ -1499,6 +1499,8 @@ typedef enum VkFormat {
+ VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG = 1000054005,
+ VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG = 1000054006,
+ VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007,
++ VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR = 1000470000,
++ VK_FORMAT_A8_UNORM_KHR = 1000470001,
+ VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK,
+ VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK,
+ VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK,
+diff --git a/meson.build b/meson.build
+index e479c3bf873..1b620f52f8d 100644
+--- a/meson.build
++++ b/meson.build
+@@ -666,6 +666,15 @@ if vdpau_drivers_path == ''
+ vdpau_drivers_path = join_paths(get_option('libdir'), 'vdpau')
+ endif
+
++prog_glslang = find_program('glslangValidator', native : true, required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk)
++if prog_glslang.found()
++ if run_command(prog_glslang, [ '--quiet', '--version' ], check : false).returncode() == 0
++ glslang_quiet = ['--quiet']
++ else
++ glslang_quiet = []
++ endif
++endif
++
+ if with_gallium_zink
+ dep_vulkan = dependency('vulkan')
+ endif
+diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
+index f07c465220c..54d49e5f12c 100644
+--- a/src/amd/common/ac_surface.c
++++ b/src/amd/common/ac_surface.c
+@@ -557,6 +557,82 @@ static int surf_config_sanity(const struct ac_surf_config *config, unsigned flag
+ return 0;
+ }
+
++static unsigned bpe_to_format(struct radeon_surf *surf)
++{
++ if (surf->blk_w != 1 || surf->blk_h != 1) {
++ if (surf->blk_w == 4 && surf->blk_h == 4) {
++ switch (surf->bpe) {
++ case 8:
++ return ADDR_FMT_BC1;
++ case 16:
++ /* Since BC3 and ASTC4x4 have the same block dimensions and bpe, BC3 is also
++ * reported for ASTC4x4. The match is fine, since addrlib only needs blk_w,
++ * blk_h and bpe to compute surface properties.
++ * TODO: If compress_type can be passed to this function, then this ugly BC3 and ASTC4x4
++ * matching can be avoided. 
++ */ ++ return ADDR_FMT_BC3; ++ default: ++ unreachable("invalid compressed bpe"); ++ } ++ } else if (surf->blk_w == 5 && surf->blk_h == 4) ++ return ADDR_FMT_ASTC_5x4; ++ else if (surf->blk_w == 5 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_5x5; ++ else if (surf->blk_w == 6 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_6x5; ++ else if (surf->blk_w == 6 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_6x6; ++ else if (surf->blk_w == 8 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_8x5; ++ else if (surf->blk_w == 8 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_8x6; ++ else if (surf->blk_w == 8 && surf->blk_h == 8) ++ return ADDR_FMT_ASTC_8x8; ++ else if (surf->blk_w == 10 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_10x5; ++ else if (surf->blk_w == 10 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_10x6; ++ else if (surf->blk_w == 10 && surf->blk_h == 8) ++ return ADDR_FMT_ASTC_10x8; ++ else if (surf->blk_w == 10 && surf->blk_h == 10) ++ return ADDR_FMT_ASTC_10x10; ++ else if (surf->blk_w == 12 && surf->blk_h == 10) ++ return ADDR_FMT_ASTC_12x10; ++ else if (surf->blk_w == 12 && surf->blk_h == 12) ++ return ADDR_FMT_ASTC_12x12; ++ } else { ++ switch (surf->bpe) { ++ case 1: ++ assert(!(surf->flags & RADEON_SURF_ZBUFFER)); ++ return ADDR_FMT_8; ++ case 2: ++ assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); ++ return ADDR_FMT_16; ++ case 4: ++ assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); ++ return ADDR_FMT_32; ++ case 8: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32; ++ case 12: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32_32; ++ case 16: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32_32_32; ++ default: ++ unreachable("invalid bpe"); ++ } ++ } ++ return ADDR_FMT_INVALID; ++} ++ ++/* The addrlib pitch alignment is forced to this number for all chips to support interop ++ * between any 2 chips. ++ */ ++#define LINEAR_PITCH_ALIGNMENT 256 ++ + static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *config, + struct radeon_surf *surf, bool is_stencil, unsigned level, + bool compressed, ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, +@@ -1002,23 +1078,10 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *i + assert(0); + } - #include "gfx10_format_table.h" -+#include "vk_texcompress_bcn.h" +- /* The format must be set correctly for the allocation of compressed +- * textures to work. In other cases, setting the bpp is sufficient. 
+- */
+- if (compressed) {
+- switch (surf->bpe) {
+- case 8:
+- AddrSurfInfoIn.format = ADDR_FMT_BC1;
+- break;
+- case 16:
+- AddrSurfInfoIn.format = ADDR_FMT_BC3;
+- break;
+- default:
+- assert(0);
+- }
+- } else {
++ AddrSurfInfoIn.format = bpe_to_format(surf);
++ if (!compressed)
+ AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
+- }
++
+
+ AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
+ AddrSurfInfoIn.tileIndex = -1;
+@@ -2046,50 +2109,9 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_
+
+ compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+- /* The format must be set correctly for the allocation of compressed
+- * textures to work. In other cases, setting the bpp is sufficient. */
+- if (compressed) {
+- switch (surf->bpe) {
+- case 8:
+- AddrSurfInfoIn.format = ADDR_FMT_BC1;
+- break;
+- case 16:
+- AddrSurfInfoIn.format = ADDR_FMT_BC3;
+- break;
+- default:
+- assert(0);
+- }
+- } else {
+- switch (surf->bpe) {
+- case 1:
+- assert(!(surf->flags & RADEON_SURF_ZBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_8;
+- break;
+- case 2:
+- assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_16;
+- break;
+- case 4:
+- assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32;
+- break;
+- case 8:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32;
+- break;
+- case 12:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
+- break;
+- case 16:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
+- break;
+- default:
+- assert(0);
+- }
++ AddrSurfInfoIn.format = bpe_to_format(surf);
++ if (!compressed)
+ AddrSurfInfoIn.bpp = surf->bpe * 8;
+- }
+
+ bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+ AddrSurfInfoIn.flags.color = is_color_surface && !(surf->flags & RADEON_SURF_NO_RENDER_TARGET);
+@@ -2812,7 +2834,7 @@ bool ac_surface_override_offset_stride(const struct radeon_info *info, struct ra
+ }
+ }
+ surf->u.gfx9.surf_offset = offset;
+- if (surf->u.gfx9.zs.stencil_offset)
++ if (surf->has_stencil)
+ surf->u.gfx9.zs.stencil_offset += offset;
+ } else {
+ if (pitch) {
+@@ -2919,6 +2941,84 @@ uint64_t ac_surface_get_plane_size(const struct radeon_surf *surf,
+ }
+ }
+
++uint64_t
++ac_surface_addr_from_coord(struct ac_addrlib *addrlib, const struct radeon_info *info,
++ const struct radeon_surf *surf, const struct ac_surf_info *surf_info,
++ unsigned level, unsigned x, unsigned y, unsigned layer, bool is_3d)
++{
++ /* Only implemented for GFX9+ */
++ assert(info->chip_class >= GFX9);
++
++ ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT input = {0};
++ input.size = sizeof(ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT);
++ input.slice = layer;
++ input.mipId = level;
++ input.unalignedWidth = DIV_ROUND_UP(surf_info->width, surf->blk_w);
++ input.unalignedHeight = DIV_ROUND_UP(surf_info->height, surf->blk_h);
++ input.numSlices = is_3d ? surf_info->depth : surf_info->array_size;
++ input.numMipLevels = surf_info->levels;
++ input.numSamples = surf_info->samples;
++ input.numFrags = surf_info->samples;
++ input.swizzleMode = surf->u.gfx9.swizzle_mode;
++ input.resourceType = surf->u.gfx9.resource_type;
++ input.pipeBankXor = surf->tile_swizzle;
++ input.bpp = surf->bpe * 8;
++ input.x = x;
++ input.y = y;
++
++ ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT output = {0};
++ output.size = sizeof(ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT);
++ Addr2ComputeSurfaceAddrFromCoord(addrlib->handle, &input, &output);
++ return output.addr;
++}
++
++void
++ac_surface_compute_nbc_view(struct ac_addrlib *addrlib, const struct radeon_info *info,
++ const struct radeon_surf *surf, const struct ac_surf_info *surf_info,
++ unsigned level, unsigned layer, struct ac_surf_nbc_view *out)
++{
++ /* Only implemented for GFX10+ */
++ assert(info->chip_class >= GFX10);
++
++ ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT input = {0};
++ input.size = sizeof(ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT);
++ input.swizzleMode = surf->u.gfx9.swizzle_mode;
++ input.resourceType = surf->u.gfx9.resource_type;
++ switch (surf->bpe) {
++ case 8:
++ input.format = ADDR_FMT_BC1;
++ break;
++ case 16:
++ input.format = ADDR_FMT_BC3;
++ break;
++ default:
++ assert(0);
++ }
++ input.width = surf_info->width;
++ input.height = surf_info->height;
++ input.numSlices = surf_info->array_size;
++ input.numMipLevels = surf_info->levels;
++ input.pipeBankXor = surf->tile_swizzle;
++ input.slice = layer;
++ input.mipId = level;
++
++ ADDR_E_RETURNCODE res;
++ ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT output = {0};
++ output.size = sizeof(ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT);
++ res = Addr2ComputeNonBlockCompressedView(addrlib->handle, &input, &output);
++ if (res == ADDR_OK) {
++ out->base_address_offset = output.offset;
++ out->tile_swizzle = output.pipeBankXor;
++ out->width = output.unalignedWidth;
++ out->height = output.unalignedHeight;
++ out->num_levels = output.numMipLevels;
++ out->level = output.mipId;
++ out->valid = true;
++ } else {
++ out->valid = false;
++ }
++}
++
+ void ac_surface_print_info(FILE *out, const struct radeon_info *info,
+ const struct radeon_surf *surf)
+ {
+diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
+index 1f8114bb4b7..baf5aea3d12 100644
+--- a/src/amd/common/ac_surface.h
++++ b/src/amd/common/ac_surface.h
+@@ -415,6 +415,18 @@ struct ac_surf_config {
+ unsigned is_cube : 1;
+ };
+
++
++/* Output parameters for ac_surface_compute_nbc_view */
++struct ac_surf_nbc_view {
++ bool valid;
++ uint32_t width;
++ uint32_t height;
++ uint32_t level;
++ uint32_t num_levels; /* Used for max_mip in the resource descriptor */
++ uint8_t tile_swizzle;
++ uint64_t base_address_offset;
++};
++
+ struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info, uint64_t *max_alignment);
+ void ac_addrlib_destroy(struct ac_addrlib *addrlib);
+ void *ac_addrlib_get_handle(struct ac_addrlib *addrlib);
+@@ -470,6 +482,16 @@ uint64_t ac_surface_get_plane_stride(enum chip_class chip_class,
+ uint64_t ac_surface_get_plane_size(const struct radeon_surf *surf,
+ unsigned plane);
+
++
++uint64_t 
ac_surface_addr_from_coord(struct ac_addrlib *addrlib, const struct radeon_info *info, ++ const struct radeon_surf *surf, ++ const struct ac_surf_info *surf_info, unsigned level, ++ unsigned x, unsigned y, unsigned layer, bool is_3d); ++void ac_surface_compute_nbc_view(struct ac_addrlib *addrlib, const struct radeon_info *info, ++ const struct radeon_surf *surf, ++ const struct ac_surf_info *surf_info, unsigned level, ++ unsigned layer, struct ac_surf_nbc_view *out); ++ + void ac_surface_print_info(FILE *out, const struct radeon_info *info, + const struct radeon_surf *surf); + +diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp +index 6b8f611ecc7..e49e9805734 100644 +--- a/src/amd/compiler/aco_lower_phis.cpp ++++ b/src/amd/compiler/aco_lower_phis.cpp +@@ -50,73 +50,81 @@ struct ssa_state { + std::vector outputs; /* the output per block */ + }; + +-Operand +-get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool input) ++Operand get_output(Program* program, unsigned block_idx, ssa_state* state); ++ ++void ++fill_outputs(Program* program, ssa_state* state, unsigned start, unsigned end) ++ + { +- if (!input) { +- if (state->visited[block_idx]) +- return state->outputs[block_idx]; +- +- /* otherwise, output == input */ +- Operand output = get_ssa(program, block_idx, state, true); +- state->visited[block_idx] = true; +- state->outputs[block_idx] = output; +- return output; ++ for (unsigned i = start; i < end; ++i) { ++ if (state->visited[i]) ++ continue; ++ state->outputs[i] = get_output(program, i, state); ++ state->visited[i] = true; + } ++} ++ ++Operand ++get_output(Program* program, unsigned block_idx, ssa_state* state) +{ -+ if (property_get("sys.vmi.vk.texturecompress", isEnableHardEncode, "0") && strcmp(isEnableHardEncode, "2") != 0) { -+ return false; ++ Block& block = program->blocks[block_idx]; + +- /* retrieve the Operand by checking the predecessors */ + if (state->any_pred_defined[block_idx] == pred_defined::undef) + return Operand(program->lane_mask); + +- Block& block = program->blocks[block_idx]; +- size_t pred = block.linear_preds.size(); +- Operand op; +- if (block.loop_nest_depth < state->loop_nest_depth) { ++ if (block.loop_nest_depth < state->loop_nest_depth) + /* loop-carried value for loop exit phis */ +- op = Operand::zero(program->lane_mask.bytes()); +- } else if (block.loop_nest_depth > state->loop_nest_depth || pred == 1 || +- block.kind & block_kind_loop_exit) { +- op = get_ssa(program, block.linear_preds[0], state, false); +- } else { +- assert(pred > 1); +- bool previously_visited = state->visited[block_idx]; +- /* potential recursion: anchor at loop header */ +- if (block.kind & block_kind_loop_header) { +- assert(!previously_visited); +- previously_visited = true; +- state->visited[block_idx] = true; +- state->outputs[block_idx] = Operand(Temp(program->allocateTmp(program->lane_mask))); +- } ++ return Operand::zero(program->lane_mask.bytes()); + +- /* collect predecessor output operands */ +- std::vector ops(pred); +- for (unsigned i = 0; i < pred; i++) +- ops[i] = get_ssa(program, block.linear_preds[i], state, false); ++ size_t pred = block.linear_preds.size(); ++ if (block.loop_nest_depth > state->loop_nest_depth || pred == 1 || ++ block.kind & block_kind_loop_exit) ++ return state->outputs[block.linear_preds[0]]; + +- /* check triviality */ +- if (std::all_of(ops.begin() + 1, ops.end(), [&](Operand same) { return same == ops[0]; })) +- return ops[0]; ++ if (block.kind & block_kind_loop_header) { ++ 
assert(!state->visited[block_idx]); + +- /* Return if this was handled in a recursive call by a loop header phi */ +- if (!previously_visited && state->visited[block_idx]) +- return state->outputs[block_idx]; + +- if (block.kind & block_kind_loop_header) +- op = state->outputs[block_idx]; +- else +- op = Operand(Temp(program->allocateTmp(program->lane_mask))); +- +- /* create phi */ +- aco_ptr phi{ +- create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; +- for (unsigned i = 0; i < pred; i++) +- phi->operands[i] = ops[i]; +- phi->definitions[0] = Definition(op.getTemp()); +- block.instructions.emplace(block.instructions.begin(), std::move(phi)); ++ unsigned start_idx = block_idx + 1; ++ unsigned end_idx = block_idx + 1; ++ while (program->blocks[end_idx].loop_nest_depth >= state->loop_nest_depth) ++ end_idx++; ++ ++ state->outputs[block_idx] = Operand(Temp(program->allocateTmp(program->lane_mask))); ++ fill_outputs(program, state, start_idx, end_idx); + } + +- assert(op.size() == program->lane_mask.size()); +- return op; ++ /* collect predecessor output operands */ ++ std::vector ops(pred); ++ for (unsigned i = 0; i < pred; i++) ++ ops[i] = state->outputs[block.linear_preds[i]]; ++ ++ /* check triviality ++ * For loop exit phis that go through the loop header, we already defined a temp as the loop ++ * header block's output. This temp always needs to be defined, so we can't avoid creating the ++ * phi instruction. */ ++ if (!(block.kind & block_kind_loop_header) && ++ std::all_of(ops.begin() + 1, ops.end(), [&](Operand same) { return same == ops[0]; })) ++ return ops[0]; ++ ++ Operand output; ++ ++ if (block.kind & block_kind_loop_header) ++ output = state->outputs[block_idx]; ++ else ++ output = Operand(Temp(program->allocateTmp(program->lane_mask))); ++ ++ /* create phi */ ++ aco_ptr phi{ ++ create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; ++ for (unsigned i = 0; i < pred; i++) ++ phi->operands[i] = ops[i]; ++ phi->definitions[0] = Definition(output.getTemp()); ++ block.instructions.emplace(block.instructions.begin(), std::move(phi)); ++ ++ assert(output.size() == program->lane_mask.size()); ++ ++ return output; + } + + void +@@ -139,7 +147,7 @@ build_merge_code(Program* program, ssa_state* state, Block* block, Operand cur) + { + unsigned block_idx = block->index; + Definition dst = Definition(state->outputs[block_idx].getTemp()); +- Operand prev = get_ssa(program, block_idx, state, true); ++ Operand prev = get_output(program, block_idx, state); + if (cur.isUndefined()) + cur = Operand::zero(program->lane_mask.bytes()); + +@@ -193,8 +201,10 @@ build_merge_code(Program* program, ssa_state* state, Block* block, Operand cur) + } + + void +-init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr& phi) ++fill_state(Program* program, Block* block, ssa_state* state, aco_ptr& phi) + { ++ Builder bld(program); ++ + std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), pred_defined::undef); + for (unsigned i = 0; i < block->logical_preds.size(); i++) { + if (phi->operands[i].isUndefined()) +@@ -209,14 +219,14 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + unsigned start = block->logical_preds[0]; + unsigned end = block->index; + +- /* for loop exit phis, start at the loop header */ ++ /* for loop exit phis, start at the loop pre-header */ + if (block->kind & block_kind_loop_exit) { +- while (program->blocks[start - 1].loop_nest_depth >= state->loop_nest_depth) ++ while 
(program->blocks[start].loop_nest_depth >= state->loop_nest_depth) + start--; + /* If the loop-header has a back-edge, we need to insert a phi. + * This will contain a defined value */ +- if (program->blocks[start].linear_preds.size() > 1) +- state->any_pred_defined[start] = pred_defined::temp; ++ if (program->blocks[start + 1].linear_preds.size() > 1) ++ state->any_pred_defined[start + 1] = pred_defined::temp; + } + /* for loop header phis, end at the loop exit */ + if (block->kind & block_kind_loop_header) { +@@ -231,10 +241,10 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + // TODO: find more occasions where pred_defined::zero is beneficial (e.g. with 2+ temp merges) + if (block->kind & block_kind_loop_exit) { + /* zero the loop-carried variable */ +- if (program->blocks[start].linear_preds.size() > 1) { +- state->any_pred_defined[start] |= pred_defined::zero; ++ if (program->blocks[start + 1].linear_preds.size() > 1) { ++ state->any_pred_defined[start + 1] |= pred_defined::zero; + // TODO: emit this zero explicitly +- state->any_pred_defined[start - 1] = pred_defined::const_0; ++ state->any_pred_defined[start] = pred_defined::const_0; + } + } + +@@ -246,14 +256,24 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + } + + state->any_pred_defined[block->index] = pred_defined::undef; ++ for (unsigned i = 0; i < phi->operands.size(); i++) { ++ unsigned pred = block->logical_preds[i]; ++ if (state->any_pred_defined[pred] != pred_defined::undef) ++ state->outputs[pred] = Operand(bld.tmp(bld.lm)); ++ else ++ state->outputs[pred] = phi->operands[i]; ++ assert(state->outputs[pred].size() == bld.lm.size()); ++ state->visited[pred] = true; + } + -+ // 暂不支持对图像数组进行压缩 -+ if (pCreateInfo->arrayLayers > 1) { -+ return false; -+ } ++ fill_outputs(program, state, start, end); + -+ if (!radv_is_format_emulated(pdev, pCreateInfo->format)) { -+ return false; + } + + void + lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + aco_ptr& phi) + { +- Builder bld(program); +- + if (!state->checked_preds_for_uniform) { + state->all_preds_uniform = !(block->kind & block_kind_merge) && + block->linear_preds.size() == block->logical_preds.size(); +@@ -276,17 +296,7 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + if (block->kind & block_kind_loop_exit) + state->loop_nest_depth += 1; + std::fill(state->visited.begin(), state->visited.end(), false); +- init_any_pred_defined(program, state, block, phi); +- +- for (unsigned i = 0; i < phi->operands.size(); i++) { +- unsigned pred = block->logical_preds[i]; +- if (state->any_pred_defined[pred] != pred_defined::undef) +- state->outputs[pred] = Operand(bld.tmp(bld.lm)); +- else +- state->outputs[pred] = phi->operands[i]; +- assert(state->outputs[pred].size() == bld.lm.size()); +- state->visited[pred] = true; +- } ++ fill_state(program, block, state, phi); + + for (unsigned i = 0; i < phi->operands.size(); i++) + build_merge_code(program, state, &program->blocks[block->logical_preds[i]], phi->operands[i]); +@@ -303,7 +313,7 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + assert(phi->operands.size() == num_preds); + + for (unsigned i = 0; i < num_preds; i++) +- phi->operands[i] = get_ssa(program, block->linear_preds[i], state, false); ++ phi->operands[i] = state->outputs[block->linear_preds[i]]; + + return; + } +diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build +index 8e5c908dbbf..71c8b16d4fc 100644 
+--- a/src/amd/vulkan/meson.build ++++ b/src/amd/vulkan/meson.build +@@ -69,6 +69,7 @@ libradv_files = files( + 'radv_meta_fast_clear.c', + 'radv_meta_fmask_copy.c', + 'radv_meta_fmask_expand.c', ++ 'radv_meta_astc_decode.c', + 'radv_meta_resolve.c', + 'radv_meta_resolve_cs.c', + 'radv_meta_resolve_fs.c', +diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c +index 875f196f29c..4e85be520af 100644 +--- a/src/amd/vulkan/radv_cmd_buffer.c ++++ b/src/amd/vulkan/radv_cmd_buffer.c +@@ -7531,7 +7531,7 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf + radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH); + } + +-static void ++void + radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) + { + radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, +diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c +index 449ec3037a6..7dbaebafe5c 100644 +--- a/src/amd/vulkan/radv_device.c ++++ b/src/amd/vulkan/radv_device.c +@@ -708,9 +708,11 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm + + #ifdef ANDROID + device->emulate_etc2 = !radv_device_supports_etc(device); ++ device->emulate_astc = true; + #else + device->emulate_etc2 = !radv_device_supports_etc(device) && + driQueryOptionb(&device->instance->dri_options, "radv_require_etc2"); ++ device->emulate_astc = driQueryOptionb(&device->instance->dri_options, "radv_require_astc"); + #endif + + snprintf(device->name, sizeof(device->name), "AMD RADV %s%s", device->rad_info.name, +@@ -957,6 +959,7 @@ static const driOptionDescription radv_dri_options[] = { + DRI_CONF_RADV_DISABLE_DCC(false) + DRI_CONF_RADV_REPORT_APU_AS_DGPU(false) + DRI_CONF_RADV_REQUIRE_ETC2(false) ++ DRI_CONF_RADV_REQUIRE_ASTC(true) + DRI_CONF_RADV_DISABLE_HTILE_LAYERS(false) + DRI_CONF_RADV_DISABLE_ANISO_SINGLE_LEVEL(false) + DRI_CONF_RADV_DISABLE_SINKING_LOAD_INPUT_FS(false) +@@ -1222,7 +1225,7 @@ radv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, VkPhysicalDevice + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = radv_device_supports_etc(pdevice) || pdevice->emulate_etc2, +- .textureCompressionASTC_LDR = false, ++ .textureCompressionASTC_LDR = pdevice->emulate_astc, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, +@@ -5574,6 +5577,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + uint64_t va; + const struct radv_image_plane *plane = &iview->image->planes[iview->plane_id]; + const struct radeon_surf *surf = &plane->surface; ++ uint8_t tile_swizzle = plane->surface.tile_swizzle; + + desc = vk_format_description(iview->vk_format); + +@@ -5583,6 +5587,11 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1); + + va = radv_buffer_get_va(iview->image->bo) + iview->image->offset; ++ ++ if (iview->nbc_view.valid) { ++ va += iview->nbc_view.base_address_offset; ++ tile_swizzle = iview->nbc_view.tile_swizzle; + } + + cb->cb_color_base = va >> 8; + +@@ -5609,14 +5618,14 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + } + + cb->cb_color_base += surf->u.gfx9.surf_offset >> 8; +- cb->cb_color_base |= surf->tile_swizzle; ++ cb->cb_color_base |= tile_swizzle; + } else { + const struct legacy_surf_level *level_info = 
&surf->u.legacy.level[iview->base_mip]; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + + cb->cb_color_base += level_info->offset_256B; + if (level_info->mode == RADEON_SURF_MODE_2D) +- cb->cb_color_base |= surf->tile_swizzle; ++ cb->cb_color_base |= tile_swizzle; + + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = (level_info->nblk_x * level_info->nblk_y) / 64 - 1; +@@ -5655,7 +5664,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + device->physical_device->rad_info.chip_class <= GFX8) + va += plane->surface.u.legacy.color.dcc_level[iview->base_mip].dcc_offset; + +- unsigned dcc_tile_swizzle = surf->tile_swizzle; ++ unsigned dcc_tile_swizzle = tile_swizzle; + dcc_tile_swizzle &= ((1 << surf->meta_alignment_log2) - 1) >> 8; + + cb->cb_dcc_base = va >> 8; +@@ -5764,9 +5773,17 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + vk_format_get_plane_width(iview->image->vk_format, iview->plane_id, iview->extent.width); + unsigned height = + vk_format_get_plane_height(iview->image->vk_format, iview->plane_id, iview->extent.height); ++ unsigned max_mip = iview->image->info.levels - 1; + + if (device->physical_device->rad_info.chip_class >= GFX10) { +- cb->cb_color_view |= S_028C6C_MIP_LEVEL_GFX10(iview->base_mip); ++ unsigned base_level = iview->base_mip; + -+ const enum util_format_layout format_layout = vk_format_description(pCreateInfo->format)->layout; -+ if (format_layout != UTIL_FORMAT_LAYOUT_ASTC && -+ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_UNORM && -+ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_SRGB) { -+ return false; -+ } ++ if (iview->nbc_view.valid) { ++ base_level = iview->nbc_view.level; ++ max_mip = iview->nbc_view.num_levels - 1; ++ } + -+ // 过滤掉对稀疏纹理的压缩,对该部分纹理进行压缩时可能造成GPU reset。 -+ if (pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT | VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | -+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT)) { -+ return false; -+ } ++ cb->cb_color_view |= S_028C6C_MIP_LEVEL_GFX10(base_level); + + cb->cb_color_attrib3 |= S_028EE0_MIP0_DEPTH(mip0_depth) | + S_028EE0_RESOURCE_TYPE(surf->u.gfx9.resource_type) | +@@ -5778,7 +5795,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + } + + cb->cb_color_attrib2 = S_028C68_MIP0_WIDTH(width - 1) | S_028C68_MIP0_HEIGHT(height - 1) | +- S_028C68_MAX_MIP(iview->image->info.levels - 1); ++ S_028C68_MAX_MIP(max_mip); + } + } + +diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c +index 475bd9a17d9..a3d6fd8b650 100644 +--- a/src/amd/vulkan/radv_formats.c ++++ b/src/amd/vulkan/radv_formats.c +@@ -37,6 +37,7 @@ + #include "util/half_float.h" + #include "vulkan/util/vk_format.h" + #include "vulkan/util/vk_enum_defines.h" ++#include "vk_texcompress_astc.h" + + uint32_t + radv_translate_buffer_dataformat(const struct util_format_description *desc, int first_non_void) +@@ -652,24 +653,56 @@ radv_is_zs_format_supported(VkFormat format) + return radv_translate_dbformat(format) != V_028040_Z_INVALID || format == VK_FORMAT_S8_UINT; + } + + -+ // 检查图像的用途,防止压缩 swapchain 的 images;并只对 VK_IMAGE_TYPE_2D 类型纹理进行压缩 -+ if (!(pCreateInfo->usage & VK_IMAGE_USAGE_SAMPLED_BIT) || pCreateInfo->imageType != VK_IMAGE_TYPE_2D) { -+ return false; -+ } + static bool +-radv_is_filter_minmax_format_supported(VkFormat format) ++radv_is_filter_minmax_format_supported(const struct radv_physical_device *pdev, VkFormat format) + { +- /* 
From the Vulkan spec 1.1.71: +- * +- * "The following formats must support the +- * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT feature with +- * VK_IMAGE_TILING_OPTIMAL, if they support +- * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT." +- */ +- /* TODO: enable more formats. */ ++ enum chip_class gfx_level = pdev->rad_info.chip_class; ++ + switch (format) { ++ case VK_FORMAT_R4G4_UNORM_PACK8: ++ case VK_FORMAT_R4G4B4A4_UNORM_PACK16: ++ case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT: ++ case VK_FORMAT_B4G4R4A4_UNORM_PACK16: ++ case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT: ++ case VK_FORMAT_R5G6B5_UNORM_PACK16: ++ case VK_FORMAT_B5G6R5_UNORM_PACK16: ++ case VK_FORMAT_R5G5B5A1_UNORM_PACK16: ++ case VK_FORMAT_B5G5R5A1_UNORM_PACK16: ++ case VK_FORMAT_A1R5G5B5_UNORM_PACK16: ++ case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + case VK_FORMAT_R8_UNORM: ++ case VK_FORMAT_A8_UNORM_KHR: + case VK_FORMAT_R8_SNORM: ++ case VK_FORMAT_R8_SRGB: ++ case VK_FORMAT_R8G8_UNORM: ++ case VK_FORMAT_R8G8_SNORM: ++ case VK_FORMAT_R8G8_SRGB: ++ case VK_FORMAT_R8G8B8A8_UNORM: ++ case VK_FORMAT_R8G8B8A8_SNORM: ++ case VK_FORMAT_R8G8B8A8_SRGB: ++ case VK_FORMAT_B8G8R8A8_UNORM: ++ case VK_FORMAT_B8G8R8A8_SNORM: ++ case VK_FORMAT_B8G8R8A8_SRGB: ++ case VK_FORMAT_A8B8G8R8_UNORM_PACK32: ++ case VK_FORMAT_A8B8G8R8_SNORM_PACK32: ++ case VK_FORMAT_A8B8G8R8_SRGB_PACK32: ++ case VK_FORMAT_A2R10G10B10_UNORM_PACK32: ++ case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + case VK_FORMAT_R16_UNORM: + case VK_FORMAT_R16_SNORM: + case VK_FORMAT_R16_SFLOAT: ++ case VK_FORMAT_R16G16_UNORM: ++ case VK_FORMAT_R16G16_SNORM: ++ case VK_FORMAT_R16G16_SFLOAT: ++ case VK_FORMAT_R16G16B16A16_UNORM: ++ case VK_FORMAT_R16G16B16A16_SNORM: ++ case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R32_SFLOAT: ++ case VK_FORMAT_R32G32_SFLOAT: ++ case VK_FORMAT_R32G32B32_SFLOAT: ++ case VK_FORMAT_R32G32B32A32_SFLOAT: ++ case VK_FORMAT_B10G11R11_UFLOAT_PACK32: + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D32_SFLOAT: +@@ -677,11 +710,53 @@ radv_is_filter_minmax_format_supported(VkFormat format) + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return true; ++ case VK_FORMAT_R8_UINT: ++ case VK_FORMAT_R8_SINT: ++ case VK_FORMAT_R8G8_UINT: ++ case VK_FORMAT_R8G8_SINT: ++ case VK_FORMAT_R8G8B8A8_UINT: ++ case VK_FORMAT_R8G8B8A8_SINT: ++ case VK_FORMAT_B8G8R8A8_UINT: ++ case VK_FORMAT_B8G8R8A8_SINT: ++ case VK_FORMAT_A8B8G8R8_UINT_PACK32: ++ case VK_FORMAT_A8B8G8R8_SINT_PACK32: ++ case VK_FORMAT_A2R10G10B10_UINT_PACK32: ++ case VK_FORMAT_A2B10G10R10_UINT_PACK32: ++ case VK_FORMAT_R16_UINT: ++ case VK_FORMAT_R16_SINT: ++ case VK_FORMAT_R16G16_UINT: ++ case VK_FORMAT_R16G16_SINT: ++ case VK_FORMAT_R16G16B16A16_UINT: ++ case VK_FORMAT_R16G16B16A16_SINT: ++ case VK_FORMAT_R32_UINT: ++ case VK_FORMAT_R32_SINT: ++ case VK_FORMAT_R32G32_UINT: ++ case VK_FORMAT_R32G32_SINT: ++ case VK_FORMAT_R32G32B32_UINT: ++ case VK_FORMAT_R32G32B32_SINT: ++ case VK_FORMAT_R32G32B32A32_UINT: ++ case VK_FORMAT_R32G32B32A32_SINT: ++ case VK_FORMAT_S8_UINT: ++ return gfx_level >= GFX9; ++ case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: ++ return gfx_level >= GFX7; + default: + return false; + } + } + ++bool ++radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format) ++{ ++ if (physical_device->emulate_etc2 && vk_texcompress_etc2_emulation_format(format) != VK_FORMAT_UNDEFINED) ++ return true; + -+ // 图像太大内存申请不了 -+ if (pCreateInfo->extent.width >= 3960 && pCreateInfo->extent.height >= 3960) { -+ return false; -+ } ++ 
if (physical_device->emulate_astc && vk_texcompress_astc_emulation_format(format) != VK_FORMAT_UNDEFINED) ++ return true; + -+ // 检查图像大小 -+ if (pCreateInfo->extent.width >= 256 && pCreateInfo->extent.height >= 256) { -+ return true; -+ } -+ return false; ++ return false; +} + + bool + radv_device_supports_etc(struct radv_physical_device *physical_device) + { +@@ -708,8 +783,21 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + return; + } + +- if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && !radv_device_supports_etc(physical_device) && +- !physical_device->emulate_etc2) { ++ if ((desc->layout == UTIL_FORMAT_LAYOUT_ETC && !radv_device_supports_etc(physical_device)) ++ || desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ if (radv_is_format_emulated(physical_device, format)) { ++ /* required features for compressed formats */ ++ tiled = VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT | ++ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | ++ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; ++ ++ VkFormat emulation_format = vk_texcompress_etc2_emulation_format(format); ++ if (emulation_format == VK_FORMAT_UNDEFINED) ++ emulation_format = vk_texcompress_astc_emulation_format(format); ++ ++ if (radv_is_filter_minmax_format_supported(physical_device, emulation_format)) ++ tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; ++ } + out_properties->linearTilingFeatures = linear; + out_properties->optimalTilingFeatures = tiled; + out_properties->bufferFeatures = buffer; +@@ -763,7 +851,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + tiled |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + tiled |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + +- if (radv_is_filter_minmax_format_supported(format)) ++ if (radv_is_filter_minmax_format_supported(physical_device, format)) + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (vk_format_has_depth(format)) { +@@ -784,7 +872,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + +- if (radv_is_filter_minmax_format_supported(format)) ++ if (radv_is_filter_minmax_format_supported(physical_device, format)) + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (linear_sampling) { +@@ -1357,6 +1445,9 @@ radv_check_modifier_support(struct radv_physical_device *dev, + if (info->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + ++ if (radv_is_format_emulated(dev, format)) ++ return VK_ERROR_FORMAT_NOT_SUPPORTED; ++ + if (!desc || (desc->layout == UTIL_FORMAT_LAYOUT_ETC && dev->emulate_etc2)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + +@@ -1493,6 +1584,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device, + if (format_feature_flags == 0) + goto unsupported; + ++ if (info->type == VK_IMAGE_TYPE_1D && radv_is_format_emulated(physical_device, format)) ++ goto unsupported; + if (info->type != VK_IMAGE_TYPE_2D && vk_format_is_depth_or_stencil(format)) + goto unsupported; + +@@ -1633,9 +1726,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device, + goto unsupported; + } + +- if ((info->flags & +- (VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | 
VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT)) && +- (desc->layout == UTIL_FORMAT_LAYOUT_ETC && physical_device->emulate_etc2)) { ++ if ((info->flags & (VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT)) && ++ radv_is_format_emulated(physical_device, format)) { + goto unsupported; + } + +@@ -1681,9 +1773,8 @@ get_external_image_format_properties(struct radv_physical_device *physical_devic + VkExternalMemoryFeatureFlagBits flags = 0; + VkExternalMemoryHandleTypeFlags export_flags = 0; + VkExternalMemoryHandleTypeFlags compat_flags = 0; +- const struct util_format_description *desc = vk_format_description(pImageFormatInfo->format); + +- if (!desc || (desc->layout == UTIL_FORMAT_LAYOUT_ETC && physical_device->emulate_etc2)) ++ if (radv_is_format_emulated(physical_device, pImageFormatInfo->format)) + return; + + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) +diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c +index ae19fc1987c..3fd4ea777a8 100644 +--- a/src/amd/vulkan/radv_image.c ++++ b/src/amd/vulkan/radv_image.c +@@ -544,40 +544,19 @@ radv_patch_image_from_extra_info(struct radv_device *device, struct radv_image * + return VK_SUCCESS; + } + +-static VkFormat +-etc2_emulation_format(VkFormat format) +-{ +- switch (format) { +- case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: +- return VK_FORMAT_R8G8B8A8_UNORM; +- case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: +- return VK_FORMAT_R8G8B8A8_SRGB; +- case VK_FORMAT_EAC_R11_UNORM_BLOCK: +- return VK_FORMAT_R16_UNORM; +- case VK_FORMAT_EAC_R11_SNORM_BLOCK: +- return VK_FORMAT_R16_SNORM; +- case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: +- return VK_FORMAT_R16G16_UNORM; +- case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: +- return VK_FORMAT_R16G16_SNORM; +- default: +- unreachable("Unhandled ETC format"); +- } +-} ++extern bool radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + static VkFormat radv_image_get_plane_format(const struct radv_physical_device *pdev, const struct radv_image *image, unsigned plane) -@@ -553,10 +614,15 @@ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struc - if (radv_is_format_emulated(pdev, image->vk_format)) { + { +- if (pdev->emulate_etc2 && +- vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(pdev, image->vk_format)) { if (plane == 0) return image->vk_format; -- if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) -+ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ if (image->isNeedHardEncode) -+ return radv_translate_rgba2dxt5(vk_texcompress_astc_emulation_format(image->vk_format)); - return vk_texcompress_astc_emulation_format(image->vk_format); -- else -+ } else { -+ if (image->isNeedHardEncode) -+ return radv_translate_rgba2dxt5(vk_texcompress_etc2_emulation_format(image->vk_format)); - return vk_texcompress_etc2_emulation_format(image->vk_format); -+ } +- return etc2_emulation_format(image->vk_format); ++ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) ++ return vk_texcompress_astc_emulation_format(image->vk_format); ++ else ++ return vk_texcompress_etc2_emulation_format(image->vk_format); } return vk_format_get_plane_format(image->vk_format, plane); } -@@ -1710,6 
+1776,12 @@ radv_destroy_image(struct radv_device *device, const VkAllocationCallbacks *pAll - vk_free2(&device->vk.alloc, pAllocator, image); +@@ -773,11 +752,13 @@ si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *im + const struct legacy_surf_level *base_level_info, unsigned plane_id, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, bool is_storage_image, bool disable_compression, +- bool enable_write_compression, uint32_t *state) ++ bool enable_write_compression, uint32_t *state, ++ const struct ac_surf_nbc_view *nbc_view) + { + struct radv_image_plane *plane = &image->planes[plane_id]; + uint64_t gpu_address = image->bo ? radv_buffer_get_va(image->bo) + image->offset : 0; + uint64_t va = gpu_address; ++ uint8_t swizzle = plane->surface.tile_swizzle; + enum chip_class chip_class = device->physical_device->rad_info.chip_class; + uint64_t meta_va = 0; + if (chip_class >= GFX9) { +@@ -785,12 +766,16 @@ si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *im + va += plane->surface.u.gfx9.zs.stencil_offset; + else + va += plane->surface.u.gfx9.surf_offset; ++ if (nbc_view && nbc_view->valid) { ++ va += nbc_view->base_address_offset; ++ swizzle = nbc_view->tile_swizzle; ++ } + } else + va += (uint64_t)base_level_info->offset_256B * 256; + + state[0] = va >> 8; + if (chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) +- state[0] |= plane->surface.tile_swizzle; ++ state[0] |= swizzle; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + +@@ -960,7 +945,8 @@ gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *ima + const VkComponentMapping *mapping, unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, +- uint32_t *state, uint32_t *fmask_state) ++ uint32_t *state, uint32_t *fmask_state, ++ const struct ac_surf_nbc_view *nbc_view) + { + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; +@@ -1015,12 +1001,17 @@ gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *ima + state[4] = S_00A010_DEPTH(type == V_008F1C_SQ_RSRC_IMG_3D ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(0) | +- S_00A014_MAX_MIP(image->info.samples > 1 ? util_logbase2(image->info.samples) +- : image->info.levels - 1) | + S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + ++ unsigned max_mip = ++ image->info.samples > 1 ? 
util_logbase2(image->info.samples) : image->info.levels - 1; ++ if (nbc_view && nbc_view->valid) ++ max_mip = nbc_view->num_levels - 1; ++ ++ state[5] |= S_00A014_MAX_MIP(max_mip); ++ + if (radv_dcc_enabled(image, first_level)) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE( +@@ -1299,12 +1290,12 @@ radv_make_texture_descriptor(struct radv_device *device, struct radv_image *imag + const VkComponentMapping *mapping, unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, +- uint32_t *fmask_state) ++ uint32_t *fmask_state, const struct ac_surf_nbc_view *nbc_view) + { + if (device->physical_device->rad_info.chip_class >= GFX10) { + gfx10_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, + first_level, last_level, first_layer, last_layer, width, height, +- depth, min_lod, state, fmask_state); ++ depth, min_lod, state, fmask_state, nbc_view); + } else { + si_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, + first_level, last_level, first_layer, last_layer, width, height, +@@ -1324,11 +1315,11 @@ radv_query_opaque_metadata(struct radv_device *device, struct radv_image *image, + radv_make_texture_descriptor(device, image, false, (VkImageViewType)image->type, + image->vk_format, &fixedmapping, 0, image->info.levels - 1, 0, + image->info.array_size - 1, image->info.width, image->info.height, +- image->info.depth, 0.0f, desc, NULL); ++ image->info.depth, 0.0f, desc, NULL, NULL); + + si_set_mutable_tex_desc_fields(device, image, &image->planes[0].surface.u.legacy.level[0], 0, 0, + 0, image->planes[0].surface.blk_w, false, false, false, false, +- desc); ++ desc, NULL); + + ac_surface_get_umd_metadata(&device->physical_device->rad_info, &image->planes[0].surface, + image->info.levels, desc, &md->size_metadata, md->metadata); +@@ -1562,7 +1553,7 @@ radv_image_use_comp_to_single(const struct radv_device *device, const struct rad + static unsigned + radv_get_internal_plane_count(const struct radv_physical_device *pdev, VkFormat fmt) + { +- if (pdev->emulate_etc2 && vk_format_description(fmt)->layout == UTIL_FORMAT_LAYOUT_ETC) ++ if (radv_is_format_emulated(pdev, fmt)) + return 2; + return vk_format_get_plane_count(fmt); + } +@@ -2024,17 +2015,34 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ + return VK_SUCCESS; } -+void -+radv_internal_image_finish(struct radv_image *image) ++ ++static inline void ++compute_non_block_compressed_view(const struct radv_device *device, ++ const struct radv_image_view *iview, ++ struct ac_surf_nbc_view *nbc_view) +{ -+ vk_object_base_finish(&image->base); ++ const struct radv_image *image = iview->image; ++ const struct radeon_surf *surf = &image->planes[0].surface; ++ struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); ++ ++ ac_surface_compute_nbc_view(addrlib, &device->physical_device->rad_info, surf, &image->info, ++ iview->base_mip, iview->base_layer, nbc_view); +} ++ + static void - radv_image_print_info(struct radv_device *device, struct radv_image *image) + radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_device *device, + VkFormat vk_format, const VkComponentMapping *components, + float min_lod, + bool is_storage_image, bool disable_compression, + bool enable_compression, unsigned plane_id, +- unsigned descriptor_plane_id) ++ 
unsigned descriptor_plane_id, ++ const struct ac_surf_nbc_view *nbc_view) { -@@ -1890,9 +1962,11 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ - vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); - -+ bool needHardEncode = radv_need_hard_encode(device->physical_device, pCreateInfo); -+ - unsigned plane_count = radv_get_internal_plane_count(device->physical_device, format); - -- bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo); -+ bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo) && !needHardEncode; - bool needSoftDecode = needSoftEncode && radv_need_soft_decode(format); - VkFormat softDecodeFormat = radv_translate_etc(format); - VkFormat softEncodeFormat = radv_translate_rgb2BC(format); -@@ -1936,6 +2010,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ - image->usage = pCreateInfo->usage; - image->flags = pCreateInfo->flags; - image->plane_count = vk_format_get_plane_count(format); -+ image->isNeedHardEncode = needHardEncode; - - //满足要求的 ETC2 纹理使用软解 - if (needSoftDecode) { + struct radv_image *image = iview->image; + struct radv_image_plane *plane = &image->planes[plane_id]; + bool is_stencil = iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT; ++ unsigned first_layer = iview->base_layer; + uint32_t blk_w; + union radv_descriptor *descriptor; + uint32_t hw_level = 0; +@@ -2050,16 +2058,27 @@ radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_devic + blk_w = plane->surface.blk_w / vk_format_get_blockwidth(plane->format) * + vk_format_get_blockwidth(vk_format); + +- if (device->physical_device->rad_info.chip_class >= GFX9) ++ if (device->physical_device->rad_info.chip_class >= GFX9) { + hw_level = iview->base_mip; ++ if (nbc_view->valid) { ++ hw_level = nbc_view->level; ++ iview->extent.width = nbc_view->width; ++ iview->extent.height = nbc_view->height; ++ ++ /* Clear the base array layer because addrlib adds it as part of the base addr offset. */ ++ first_layer = 0; ++ } ++ } ++ + radv_make_texture_descriptor( + device, image, is_storage_image, iview->type, vk_format, components, hw_level, +- hw_level + iview->level_count - 1, iview->base_layer, ++ hw_level + iview->level_count - 1, first_layer, + iview->base_layer + iview->layer_count - 1, + vk_format_get_plane_width(image->vk_format, plane_id, iview->extent.width), + vk_format_get_plane_height(image->vk_format, plane_id, iview->extent.height), + iview->extent.depth, min_lod, descriptor->plane_descriptors[descriptor_plane_id], +- descriptor_plane_id || is_storage_image ? NULL : descriptor->fmask_descriptor); ++ descriptor_plane_id || is_storage_image ? 
NULL : descriptor->fmask_descriptor, ++ nbc_view); + + const struct legacy_surf_level *base_level_info = NULL; + if (device->physical_device->rad_info.chip_class <= GFX9) { +@@ -2075,7 +2094,7 @@ radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_devic + si_set_mutable_tex_desc_fields(device, image, base_level_info, plane_id, iview->base_mip, + iview->base_mip, blk_w, is_stencil, is_storage_image, + disable_compression, enable_write_compression, +- descriptor->plane_descriptors[descriptor_plane_id]); ++ descriptor->plane_descriptors[descriptor_plane_id], nbc_view); + } + + static unsigned +@@ -2182,6 +2201,7 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + iview->image = image; + iview->type = pCreateInfo->viewType; + iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask); ++ iview->nbc_view.valid = false; + iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; + iview->base_layer = range->baseArrayLayer; + iview->layer_count = radv_get_layerCount(image, range); +@@ -2215,15 +2235,11 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + iview->vk_format = radv_translate_etc(iview->vk_format); + } + +- if (device->physical_device->emulate_etc2 && +- vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { +- const struct util_format_description *desc = vk_format_description(iview->vk_format); +- assert(desc); +- if (desc->layout == UTIL_FORMAT_LAYOUT_ETC) { +- iview->plane_id = 1; +- iview->vk_format = etc2_emulation_format(iview->vk_format); +- } +- ++ /* when the view format is emulated, redirect the view to the hidden plane 1 */ ++ if (radv_is_format_emulated(device->physical_device, iview->vk_format)) { ++ assert(radv_is_format_emulated(device->physical_device, image->vk_format)); ++ iview->plane_id = 1; ++ iview->vk_format = image->planes[iview->plane_id].format; + plane_count = 1; + } + +@@ -2242,13 +2258,14 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + } + + if (iview->vk_format != image->planes[iview->plane_id].format) { ++ const struct radv_image_plane *plane = &image->planes[iview->plane_id]; + unsigned view_bw = vk_format_get_blockwidth(iview->vk_format); + unsigned view_bh = vk_format_get_blockheight(iview->vk_format); +- unsigned img_bw = vk_format_get_blockwidth(image->planes[iview->plane_id].format); +- unsigned img_bh = vk_format_get_blockheight(image->planes[iview->plane_id].format); ++ unsigned plane_bw = vk_format_get_blockwidth(plane->format); ++ unsigned plane_bh = vk_format_get_blockheight(plane->format); + +- iview->extent.width = round_up_u32(iview->extent.width * view_bw, img_bw); +- iview->extent.height = round_up_u32(iview->extent.height * view_bh, img_bh); ++ iview->extent.width = round_up_u32(iview->extent.width * view_bw, plane_bw); ++ iview->extent.height = round_up_u32(iview->extent.height * view_bh, plane_bh); + + /* Comment ported from amdvlk - + * If we have the following image: +@@ -2277,27 +2294,37 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + * the plain converted dimensions the physical layout is correct. 
+ */
+ if (device->physical_device->rad_info.chip_class >= GFX9 &&
+- vk_format_is_compressed(image->vk_format) && !vk_format_is_compressed(iview->vk_format)) {
++ vk_format_is_block_compressed(image->vk_format) &&
++ !vk_format_is_block_compressed(iview->vk_format)) {
+ /* If we have multiple levels in the view we should ideally take the last level,
+ * but the mip calculation has a max(..., 1) so walking back to the base mip in an
+ * useful way is hard. */
+ if (iview->level_count > 1) {
+- iview->extent.width = iview->image->planes[0].surface.u.gfx9.base_mip_width;
+- iview->extent.height = iview->image->planes[0].surface.u.gfx9.base_mip_height;
++ iview->extent.width = plane->surface.u.gfx9.base_mip_width;
++ iview->extent.height = plane->surface.u.gfx9.base_mip_height;
+ } else {
+ unsigned lvl_width = radv_minify(image->info.width, range->baseMipLevel);
+ unsigned lvl_height = radv_minify(image->info.height, range->baseMipLevel);
+
+- lvl_width = round_up_u32(lvl_width * view_bw, img_bw);
+- lvl_height = round_up_u32(lvl_height * view_bh, img_bh);
+-
+- lvl_width <<= range->baseMipLevel;
+- lvl_height <<= range->baseMipLevel;
+-
+- iview->extent.width = CLAMP(lvl_width, iview->extent.width,
+- iview->image->planes[0].surface.u.gfx9.base_mip_width);
+- iview->extent.height = CLAMP(lvl_height, iview->extent.height,
+- iview->image->planes[0].surface.u.gfx9.base_mip_height);
++ lvl_width = round_up_u32(lvl_width * view_bw, plane_bw);
++ lvl_height = round_up_u32(lvl_height * view_bh, plane_bh);
++
++ iview->extent.width = CLAMP(lvl_width << range->baseMipLevel, iview->extent.width,
++ plane->surface.u.gfx9.base_mip_width);
++ iview->extent.height = CLAMP(lvl_height << range->baseMipLevel, iview->extent.height,
++ plane->surface.u.gfx9.base_mip_height);
++
++ /* If the hardware-computed extent is still too small, on GFX10
++ * we can attempt another workaround provided by addrlib that
++ * changes the descriptor's base level, and adjusts the address and
++ * extents accordingly. 
++ */ ++ if (device->physical_device->rad_info.chip_class >= GFX10 && ++ (radv_minify(iview->extent.width, range->baseMipLevel) < lvl_width || ++ radv_minify(iview->extent.height, range->baseMipLevel) < lvl_height) && ++ iview->layer_count == 1) { ++ compute_non_block_compressed_view(device, iview, &iview->nbc_view); ++ } + } + } + } +@@ -2311,10 +2338,10 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + VkFormat format = vk_format_get_plane_format(iview->vk_format, i); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, false, + disable_compression, enable_compression, iview->plane_id + i, +- i); ++ i, &iview->nbc_view); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, true, + disable_compression, enable_compression, iview->plane_id + i, +- i); ++ i, &iview->nbc_view); + } + } + diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c -index b1080ef8935..9b5a73cd1be 100644 +index 82eb45748dc..b1080ef8935 100644 --- a/src/amd/vulkan/radv_meta.c +++ b/src/amd/vulkan/radv_meta.c -@@ -601,8 +601,14 @@ radv_device_init_meta(struct radv_device *device) +@@ -597,8 +597,14 @@ radv_device_init_meta(struct radv_device *device) if (result != VK_SUCCESS) - goto fail_astc_decode; + goto fail_etc_decode; -+ result = radv_device_init_meta_rgba_encode_state(device); ++ result = radv_device_init_meta_astc_decode_state(device, on_demand); + if (result != VK_SUCCESS) -+ goto fail_bcn_encode; ++ goto fail_astc_decode; + return VK_SUCCESS; -+fail_bcn_encode: -+ radv_device_finish_meta_rgba_encode_state(device); - fail_astc_decode: - radv_device_finish_meta_astc_decode_state(device); ++fail_astc_decode: ++ radv_device_finish_meta_astc_decode_state(device); fail_etc_decode: -@@ -644,6 +650,7 @@ fail_clear: - void + radv_device_finish_meta_etc_decode_state(device); + fail_fmask_copy: +@@ -639,6 +645,7 @@ void radv_device_finish_meta(struct radv_device *device) { -+ radv_device_finish_meta_rgba_encode_state(device); radv_device_finish_meta_etc_decode_state(device); - radv_device_finish_meta_astc_decode_state(device); ++ radv_device_finish_meta_astc_decode_state(device); radv_device_finish_accel_struct_build_state(device); + radv_device_finish_meta_clear_state(device); + radv_device_finish_meta_resolve_state(device); diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h -index d8ab75481d9..9702bfc9c7b 100644 +index 854cd37766c..d8ab75481d9 100644 --- a/src/amd/vulkan/radv_meta.h +++ b/src/amd/vulkan/radv_meta.h -@@ -107,6 +107,9 @@ void radv_device_finish_meta_etc_decode_state(struct radv_device *device); - VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); - void radv_device_finish_meta_astc_decode_state(struct radv_device *device); +@@ -104,6 +104,9 @@ void radv_device_finish_accel_struct_build_state(struct radv_device *device); + VkResult radv_device_init_meta_etc_decode_state(struct radv_device *device, bool on_demand); + void radv_device_finish_meta_etc_decode_state(struct radv_device *device); -+VkResult radv_device_init_meta_rgba_encode_state(struct radv_device *device); -+void radv_device_finish_meta_rgba_encode_state(struct radv_device *device); ++VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); ++void radv_device_finish_meta_astc_decode_state(struct radv_device *device); + void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer, 
uint32_t flags); -@@ -232,6 +235,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image - void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, - const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); +@@ -226,6 +229,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image + VkImageLayout layout, const VkImageSubresourceLayers *subresource, + VkOffset3D offset, VkExtent3D extent); -+void radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, + const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); + /** * Return whether the bound pipeline is the FMASK decompress pass. */ diff --git a/src/amd/vulkan/radv_meta_astc_decode.c b/src/amd/vulkan/radv_meta_astc_decode.c -index 93a63692c09..006394c639b 100644 ---- a/src/amd/vulkan/radv_meta_astc_decode.c +new file mode 100644 +index 00000000000..93a63692c09 +--- /dev/null +++ b/src/amd/vulkan/radv_meta_astc_decode.c -@@ -166,8 +166,15 @@ radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima - struct radv_image_view src_iview, dst_iview; - image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, - subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); -- image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -- subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ if (image->isNeedHardEncode) { -+ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); -+ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); -+ image_view_init(device, store_image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ } else { -+ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ } - - VkExtent3D extent_copy = { - .width = extent.width, -diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c -index b418d3f2ab9..a0a6b41cce9 100644 ---- a/src/amd/vulkan/radv_meta_copy.c -+++ b/src/amd/vulkan/radv_meta_copy.c -@@ -26,6 +26,7 @@ - #include "main/formats.h" - #include "main/texcompress_etc.h" - #include "main/texcompress_bptc_tmp.h" -+#include "vk_texcompress_bcn.h" - - static VkExtent3D - meta_image_block_size(const struct radv_image *image) -@@ -471,6 +472,74 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf - radv_meta_restore(&saved_state, cmd_buffer); - } - -+void radv_create_staging_images(struct radv_image *dst_image, struct radv_cmd_buffer *cmd_buffer) -+{ -+ struct vk_texcompress_bcn_image *staging_images = -+ vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct vk_texcompress_bcn_image), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); -+ dst_image->staging_images = staging_images; -+ -+ uint32_t size = 0; -+ VkExtent3D extent = (VkExtent3D){dst_image->info.width, 
dst_image->info.height, 1}; -+ VkFormat format = VK_FORMAT_R8G8B8A8_UNORM; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_rgba, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_rgba, dst_image->staging_images->image_rgba); -+ size += _image_rgba->size; -+ -+ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; -+ format = VK_FORMAT_R32G32_UINT; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_bc1, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_bc1, dst_image->staging_images->image_bc1); -+ size += _image_bc1->size; -+ -+ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; -+ format = VK_FORMAT_R32G32_UINT; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_bc4, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_bc4, dst_image->staging_images->image_bc4); -+ size += _image_bc4->size; -+ -+ extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; -+ format = VK_FORMAT_R8G8B8A8_UNORM; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_dummy, extent, format, size); -+ RADV_FROM_HANDLE(radv_image, _image_dummy, dst_image->staging_images->image_dummy); -+ -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_rgba, &staging_images->image_mem, 0); -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc1, &staging_images->image_mem, _image_rgba->size); -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc4, &staging_images->image_mem, _image_bc1->size); -+ -+ cmd_buffer->vk.is_need_hard_encode = true; -+ -+ vk_texcompress_insert_head(&cmd_buffer->vk.staging_image_list_head, dst_image->staging_images); -+} -+ -+void radv_copy_buffer_hard_encode(struct radv_image *dst_image, -+ const enum util_format_layout format_layout, -+ struct radv_cmd_buffer *cmd_buffer, -+ const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) -+{ -+ radv_create_staging_images(dst_image, cmd_buffer); -+ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { -+ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } else { -+ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } -+ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } -+} +@@ -0,0 +1,183 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * SPDX-License-Identifier: MIT ++ */ + - extern bool - radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); - -@@ -495,6 +564,12 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, - radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); - - const enum 
util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++#include ++#include + -+ if (dst_image->isNeedHardEncode) { -+ radv_copy_buffer_hard_encode(dst_image, format_layout, cmd_buffer, pCopyBufferToImageInfo); -+ return; -+ } -+ - for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { - if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { - radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -@@ -847,6 +922,36 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, - radv_meta_restore(&saved_state, cmd_buffer); - } - -+void radv_copy_image_hard_encode(struct radv_image *dst_image, -+ const enum util_format_layout format_layout, -+ struct radv_cmd_buffer *cmd_buffer, -+ const VkCopyImageInfo2 *pCopyImageInfo) -+{ -+ radv_create_staging_images(dst_image, cmd_buffer); -+ RADV_FROM_HANDLE(radv_image, src_image, pCopyImageInfo->srcImage); -+ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { -+ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; -+ if (src_image->vk_format != dst_image->vk_format) { -+ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * -+ vk_format_get_blockwidth(dst_image->vk_format); -+ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * -+ vk_format_get_blockheight(dst_image->vk_format); -+ } -+ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } else { -+ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } -+ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } -+} -+ - VKAPI_ATTR void VKAPI_CALL - radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyImageInfo) - { -@@ -866,6 +971,12 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI - radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); - - const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; -+ -+ if (dst_image->isNeedHardEncode) { -+ radv_copy_image_hard_encode(dst_image, format_layout, cmd_buffer, pCopyImageInfo); -+ return; -+ } -+ - for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { - VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; - if (src_image->vk_format != dst_image->vk_format) { -diff --git a/src/amd/vulkan/radv_meta_etc_decode.c b/src/amd/vulkan/radv_meta_etc_decode.c -index d5d002c63a0..3685218d915 100644 ---- a/src/amd/vulkan/radv_meta_etc_decode.c -+++ b/src/amd/vulkan/radv_meta_etc_decode.c -@@ -801,23 +801,45 @@ radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag - store_format = VK_FORMAT_R8G8B8A8_UNORM; - } - struct radv_image_view dest_iview; -- radv_image_view_init( -- &dest_iview, cmd_buffer->device, -- &(VkImageViewCreateInfo){ -- .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -- .image = radv_image_to_handle(image), -- .viewType = radv_meta_get_view_type(image), -- .format = store_format, -- .subresourceRange = -- { -- .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, 
-- .baseMipLevel = subresource->mipLevel, -- .levelCount = 1, -- .baseArrayLayer = 0, -- .layerCount = subresource->baseArrayLayer + subresource->layerCount, -- }, -- }, -- NULL); -+ if (image->isNeedHardEncode) { -+ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); -+ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); -+ radv_image_view_init( -+ &dest_iview, cmd_buffer->device, -+ &(VkImageViewCreateInfo){ -+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -+ .image = radv_image_to_handle(store_image), -+ .viewType = radv_meta_get_view_type(store_image), -+ .format = store_format, -+ .subresourceRange = -+ { -+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, -+ .baseMipLevel = subresource->mipLevel, -+ .levelCount = 1, -+ .baseArrayLayer = 0, -+ .layerCount = subresource->baseArrayLayer + subresource->layerCount, -+ }, -+ }, -+ NULL); -+ } else { -+ radv_image_view_init( -+ &dest_iview, cmd_buffer->device, -+ &(VkImageViewCreateInfo){ -+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -+ .image = radv_image_to_handle(image), -+ .viewType = radv_meta_get_view_type(image), -+ .format = store_format, -+ .subresourceRange = -+ { -+ .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, -+ .baseMipLevel = subresource->mipLevel, -+ .levelCount = 1, -+ .baseArrayLayer = 0, -+ .layerCount = subresource->baseArrayLayer + subresource->layerCount, -+ }, -+ }, -+ NULL); -+ } - - decode_etc(cmd_buffer, &src_iview, &dest_iview, &(VkOffset3D){offset.x, offset.y, base_slice}, - &(VkExtent3D){extent.width, extent.height, slice_count}); -diff --git a/src/amd/vulkan/radv_meta_rgba_encode.c b/src/amd/vulkan/radv_meta_rgba_encode.c -new file mode 100644 -index 00000000000..b144767143d ---- /dev/null -+++ b/src/amd/vulkan/radv_meta_rgba_encode.c -@@ -0,0 +1,312 @@ -+/* Copyright (c) 2017-2023 Hans-Kristian Arntzen -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include -+#include -+ -+#include "radv_meta.h" -+#include "sid.h" -+#include "vk_common_entrypoints.h" -+#include "vk_format.h" -+#include "vk_texcompress_bcn.h" -+#include "radv_debug.h" ++#include "radv_meta.h" ++#include "sid.h" ++#include "vk_common_entrypoints.h" ++#include "vk_format.h" + +struct radv_dispatch_info { + /** @@ -482,27 +1369,73 @@ index 00000000000..b144767143d +}; + +VkResult -+radv_device_init_meta_rgba_encode_state(struct radv_device *device) ++radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand) +{ + const struct radv_physical_device *pdev = device->physical_device; + struct radv_meta_state *state = &device->meta_state; + -+ return vk_texcompress_bcn_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), state->bcn_encoded); ++ if (!pdev->emulate_astc) ++ return VK_SUCCESS; ++ ++ return vk_texcompress_astc_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), &state->astc_decode); +} + +void -+radv_device_finish_meta_rgba_encode_state(struct radv_device *device) ++radv_device_finish_meta_astc_decode_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; -+ struct vk_texcompress_bcn_state **bcns = state->bcn_encoded; ++ struct vk_texcompress_astc_state *astc = state->astc_decode; + -+ if (bcns) -+ vk_texcompress_bcn_finish(&device->vk, &state->alloc, state->bcn_encoded); ++ if (astc) ++ vk_texcompress_astc_finish(&device->vk, &state->alloc, astc); +} + +extern void +radv_compute_dispatch(struct radv_cmd_buffer 
*cmd_buffer, const struct radv_dispatch_info *info); + ++static void ++decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *src_iview, struct radv_image_view *dst_iview, ++ VkImageLayout layout, const VkOffset3D *offset, const VkExtent3D *extent) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ struct vk_texcompress_astc_write_descriptor_set write_desc_set; ++ VkFormat format = src_iview->image->vk_format; ++ int blk_w = vk_format_get_blockwidth(format); ++ int blk_h = vk_format_get_blockheight(format); ++ ++ vk_texcompress_astc_fill_write_descriptor_sets(state->astc_decode, &write_desc_set, ++ radv_image_view_to_handle(src_iview), layout, ++ radv_image_view_to_handle(dst_iview), format); ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->astc_decode->p_layout, ++ 0, /* set number */ ++ VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT, write_desc_set.descriptor_set); ++ ++ VkPipeline pipeline = ++ vk_texcompress_astc_get_decode_pipeline(&device->vk, &state->alloc, state->astc_decode, radv_pipeline_cache_to_handle(&state->cache), format); ++ if (pipeline == VK_NULL_HANDLE) ++ return; ++ ++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ ++ bool is_3Dimage = (src_iview->image->type == VK_IMAGE_TYPE_3D) ? true : false; ++ int push_constants[5] = {offset->x / blk_w, offset->y / blk_h, extent->width + offset->x, extent->height + offset->y, ++ is_3Dimage}; ++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.etc_decode.p_layout, ++ VK_SHADER_STAGE_COMPUTE_BIT, 0, 20, push_constants); ++ ++ struct radv_dispatch_info info = { ++ .blocks[0] = DIV_ROUND_UP(extent->width, blk_w * 2), ++ .blocks[1] = DIV_ROUND_UP(extent->height, blk_h * 2), ++ .blocks[2] = extent->depth, ++ .offsets[0] = 0, ++ .offsets[1] = 0, ++ .offsets[2] = offset->z, ++ .unaligned = 0, ++ }; ++ radv_compute_dispatch(cmd_buffer, &info); ++} ++ +static VkImageViewType +get_view_type(const struct radv_image *image) +{ @@ -518,8 +1451,7 @@ index 00000000000..b144767143d + +static void +image_view_init(struct radv_device *device, struct radv_image *image, VkFormat format, VkImageAspectFlags aspectMask, -+ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, VkComponentMapping *component, -+ struct radv_image_view *iview) ++ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, struct radv_image_view *iview) +{ + VkImageViewCreateInfo iview_create_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -535,461 +1467,3507 @@ index 00000000000..b144767143d + .layerCount = baseArrayLayer + layerCount, + }, + }; -+ if (component) { -+ iview_create_info.components = *component; -+ } + + radv_image_view_init(iview, device, &iview_create_info, NULL); +} + -+static void -+encode_bc1(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *src_image, struct radv_image *dst_image) ++void ++radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) +{ + struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; ++ struct radv_meta_saved_state saved_state; ++ radv_meta_save(&saved_state, cmd_buffer, ++ RADV_META_SAVE_COMPUTE_PIPELINE | 
RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); + -+ struct radv_image_view src_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_R, -+ .g = VK_COMPONENT_SWIZZLE_G, -+ .b = VK_COMPONENT_SWIZZLE_B, -+ .a = VK_COMPONENT_SWIZZLE_A -+ }; -+ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &src_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; -+ vk_texcompress_bc1_fill_write_descriptor_sets(state->bcn_encoded[BC1_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&src_iview), layout, -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC1_ENCODE]->p_layout, -+ 0, /* set number */ -+ 3, write_desc_set.descriptor_set); ++ VkImage _image = radv_image_to_handle(image); ++ VK_FROM_HANDLE(vk_image, vkImage, _image); + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC1_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC1_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ uint32_t base_slice = radv_meta_get_iview_layer(image, subresource, &offset); ++ uint32_t slice_count = image->type == VK_IMAGE_TYPE_3D ++ ? 
extent.depth ++ : vk_image_subresource_layer_count(vkImage, subresource); + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ extent = radv_sanitize_image_extent(image->type, extent); ++ offset = radv_sanitize_image_offset(image->type, offset); + -+ const unsigned num_refinements = 1; -+ const unsigned push_constants[2] = {num_refinements}; -+ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC1_ENCODE]->p_layout, -+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); ++ struct radv_image_view src_iview, dst_iview; ++ image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); ++ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); + -+ struct radv_dispatch_info info = { -+ .blocks[0] = DIV_ROUND_UP(extent->width, 32), -+ .blocks[1] = DIV_ROUND_UP(extent->height, 32), -+ .blocks[2] = 1, ++ VkExtent3D extent_copy = { ++ .width = extent.width, ++ .height = extent.height, ++ .depth = slice_count, + }; -+ radv_compute_dispatch(cmd_buffer, &info); ++ decode_astc(cmd_buffer, &src_iview, &dst_iview, layout, &(VkOffset3D){offset.x, offset.y, base_slice}, &extent_copy); ++ + radv_image_view_finish(&src_iview); + radv_image_view_finish(&dst_iview); -+} + ++ radv_meta_restore(&saved_state, cmd_buffer); ++} +diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c +index bf2d7fcee33..f1a98179b05 100644 +--- a/src/amd/vulkan/radv_meta_bufimage.c ++++ b/src/amd/vulkan/radv_meta_bufimage.c +@@ -1324,6 +1324,92 @@ create_bview_for_r32g32b32(struct radv_cmd_buffer *cmd_buffer, struct radv_buffe + }); + } + ++/* GFX9+ has an issue where the HW does not calculate mipmap degradations ++ * for block-compressed images correctly (see the comment in ++ * radv_image_view_init). Some texels are unaddressable and cannot be copied ++ * to/from by a compute shader. Here we will perform a buffer copy to copy the ++ * texels that the hardware missed. ++ * ++ * GFX10 will not use this workaround because it can be fixed by adjusting its ++ * image view descriptors instead. 
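++ *
++ * Illustrative example (added note, not from the original comment): for
++ * a 36x36 image with 4x4 blocks, mip 2 is 9 texels wide, i.e.
++ * ceil(9/4) = 3 blocks, but the HW minifies the 9-block base extent to
++ * 9 >> 2 = 2 blocks; that third block column is invisible to the compute
++ * shader copy and is exactly what the manual buffer copy here transfers.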
++ */ +static void -+encode_bc4(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *src_image, struct radv_image *dst_image) ++fixup_gfx9_cs_copy(struct radv_cmd_buffer *cmd_buffer, ++ const struct radv_meta_blit2d_buffer *buf_bsurf, ++ const struct radv_meta_blit2d_surf *img_bsurf, ++ const struct radv_meta_blit2d_rect *rect, bool to_image) +{ -+ struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; ++ const unsigned mip_level = img_bsurf->level; ++ const struct radv_image *image = img_bsurf->image; ++ const struct radeon_surf *surf = &image->planes[0].surface; ++ const struct radv_device *device = cmd_buffer->device; ++ const struct radeon_info *rad_info = &device->physical_device->rad_info; ++ struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); ++ ++ /* GFX10 will use a different workaround unless this is not a 2D image */ ++ if (rad_info->chip_class < GFX9 || ++ (rad_info->chip_class >= GFX10 && image->type == VK_IMAGE_TYPE_2D) || ++ image->info.levels == 1 || !vk_format_is_block_compressed(image->vk_format)) ++ return; + -+ struct radv_image_view src_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_A, -+ .g = VK_COMPONENT_SWIZZLE_ZERO, -+ .b = VK_COMPONENT_SWIZZLE_ZERO, -+ .a = VK_COMPONENT_SWIZZLE_ONE -+ }; -+ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &src_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bc4_write_descriptor_set write_desc_set; -+ vk_texcompress_bc4_fill_write_descriptor_sets(state->bcn_encoded[BC4_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&src_iview), layout, -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC4_ENCODE]->p_layout, -+ 0, /* set number */ -+ 2, write_desc_set.descriptor_set); ++ /* The physical extent of the base mip */ ++ VkExtent2D hw_base_extent = {surf->u.gfx9.base_mip_width, surf->u.gfx9.base_mip_height}; + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC4_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC4_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ /* The hardware-calculated extent of the selected mip ++ * (naive divide-by-two integer math) ++ */ ++ VkExtent2D hw_mip_extent = {radv_minify(hw_base_extent.width, mip_level), ++ radv_minify(hw_base_extent.height, mip_level)}; + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ /* The actual extent we want to copy */ ++ VkExtent2D mip_extent = {rect->width, rect->height}; + -+ const unsigned alpha_channel_id = 0; // r通道 -+ const unsigned use_norm = 0; -+ const unsigned push_constants[2] = {alpha_channel_id, use_norm}; -+ 
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC4_ENCODE]->p_layout, -+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); ++ VkOffset2D mip_offset = {to_image ? rect->dst_x : rect->src_x, ++ to_image ? rect->dst_y : rect->src_y}; + -+ struct radv_dispatch_info info = { -+ .blocks[0] = 1, -+ .blocks[1] = DIV_ROUND_UP(extent->width, 16), -+ .blocks[2] = DIV_ROUND_UP(extent->height, 16), -+ }; -+ radv_compute_dispatch(cmd_buffer, &info); -+ radv_image_view_finish(&src_iview); -+ radv_image_view_finish(&dst_iview); ++ if (hw_mip_extent.width >= mip_offset.x + mip_extent.width && ++ hw_mip_extent.height >= mip_offset.y + mip_extent.height) ++ return; ++ ++ if (!to_image) { ++ /* If we are writing to a buffer, then we need to wait for the compute ++ * shader to finish because it may write over the unaddressable texels ++ * while we're fixing them. If we're writing to an image, we do not need ++ * to wait because the compute shader cannot write to those texels ++ */ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE; ++ } ++ ++ for (uint32_t y = 0; y < mip_extent.width; y++) { ++ uint32_t coordY = y + mip_offset.y; ++ /* If the default copy algorithm (done previously) has already seen this ++ * scanline, then we can bias the starting X coordinate over to skip the ++ * region already copied by the default copy. ++ */ ++ uint32_t x = (coordY < hw_mip_extent.height) ? hw_mip_extent.width : 0; ++ for (; x < mip_extent.width; x++) { ++ uint32_t coordX = x + mip_offset.x; ++ uint64_t addr = ac_surface_addr_from_coord(addrlib, rad_info, surf, &image->info, ++ mip_level, coordX, coordY, img_bsurf->layer, ++ image->type == VK_IMAGE_TYPE_3D); ++ struct radeon_winsys_bo *img_bo = image->bo; ++ struct radeon_winsys_bo *mem_bo = buf_bsurf->buffer->bo; ++ const uint64_t img_offset = image->offset + addr; ++ /* buf_bsurf->offset already includes the layer offset */ ++ const uint64_t mem_offset = buf_bsurf->buffer->offset + ++ buf_bsurf->offset + ++ y * buf_bsurf->pitch * surf->bpe + ++ x * surf->bpe; ++ if (to_image) { ++ radv_copy_buffer(cmd_buffer, mem_bo, img_bo, mem_offset, img_offset, surf->bpe); ++ } else { ++ radv_copy_buffer(cmd_buffer, img_bo, mem_bo, img_offset, mem_offset, surf->bpe); ++ } ++ } ++ } +} + -+static void -+encode_bc3(struct radv_cmd_buffer *cmd_buffer, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *bc1_image, struct radv_image *bc4_image, struct radv_image *dst_image) -+{ -+ struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; + static unsigned + get_image_stride_for_r32g32b32(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *surf) +@@ -1399,6 +1485,7 @@ radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_b + 16, push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); ++ fixup_gfx9_cs_copy(cmd_buffer, dst, src, &rects[r], false); + } + + radv_image_view_finish(&src_view); +@@ -1553,6 +1640,7 @@ radv_meta_buffer_to_image_cs(struct radv_cmd_buffer *cmd_buffer, + 16, push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); ++ fixup_gfx9_cs_copy(cmd_buffer, src, dst, &rects[r], true); + } + + radv_image_view_finish(&dst_view); +diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c +index 25368e64a04..b418d3f2ab9 100644 +--- 
a/src/amd/vulkan/radv_meta_copy.c ++++ b/src/amd/vulkan/radv_meta_copy.c +@@ -471,6 +471,9 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf + radv_meta_restore(&saved_state, cmd_buffer); + } + ++extern bool ++radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + -+ struct radv_image_view bc1_iview, bc4_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_R, -+ .g = VK_COMPONENT_SWIZZLE_G, -+ .b = VK_COMPONENT_SWIZZLE_ZERO, -+ .a = VK_COMPONENT_SWIZZLE_ONE -+ }; -+ image_view_init(device, bc1_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &bc1_iview); -+ image_view_init(device, bc4_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &bc4_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc1_iview.image) | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc4_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; -+ vk_texcompress_bc3_fill_write_descriptor_sets(state->bcn_encoded[BC3_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&bc1_iview), radv_image_view_to_handle(&bc4_iview), -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC3_ENCODE]->p_layout, -+ 0, /* set number */ -+ 3, write_desc_set.descriptor_set); + VKAPI_ATTR void VKAPI_CALL + radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) +@@ -485,18 +488,25 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + &pCopyBufferToImageInfo->pRegions[r]); + } + +- if (cmd_buffer->device->physical_device->emulate_etc2 && +- vk_format_description(dst_image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(cmd_buffer->device->physical_device, dst_image->vk_format)) { + cmd_buffer->state.flush_bits |= + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, dst_image) | +- radv_dst_access_flush( +- cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ ++ const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { +- radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, +- &pCopyBufferToImageInfo->pRegions[r].imageSubresource, +- pCopyBufferToImageInfo->pRegions[r].imageOffset, +- pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ 
pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } + } + } + } +@@ -849,18 +859,30 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI + pCopyImageInfo->dstImageLayout, &pCopyImageInfo->pRegions[r]); + } + +- if (cmd_buffer->device->physical_device->emulate_etc2 && +- vk_format_description(dst_image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(cmd_buffer->device->physical_device, dst_image->vk_format)) { + cmd_buffer->state.flush_bits |= + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, dst_image) | +- radv_dst_access_flush( +- cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ ++ const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { +- radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, +- &pCopyImageInfo->pRegions[r].dstSubresource, +- pCopyImageInfo->pRegions[r].dstOffset, +- pCopyImageInfo->pRegions[r].extent); ++ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; ++ if (src_image->vk_format != dst_image->vk_format) { ++ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * ++ vk_format_get_blockwidth(dst_image->vk_format); ++ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * ++ vk_format_get_blockheight(dst_image->vk_format); ++ } ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } + } + } + } +diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h +index 950ded15f58..472858dd401 100644 +--- a/src/amd/vulkan/radv_private.h ++++ b/src/amd/vulkan/radv_private.h +@@ -96,6 +96,7 @@ typedef uint32_t xcb_window_t; + + #include "radv_entrypoints.h" + ++#include "vk_texcompress_astc.h" + #include "wsi_common.h" + + #ifdef __cplusplus +@@ -303,6 +304,9 @@ struct radv_physical_device { + /* Whether to emulate ETC2 image support on HW without support. */ + bool emulate_etc2; + ++ /* Whether to emulate ASTC image support on HW without support. */ ++ bool emulate_astc; ++ + /* This is the drivers on-disk cache used as a fallback as opposed to + * the pipeline cache defined by apps. 
+ */ +@@ -675,6 +679,8 @@ struct radv_meta_state { + VkPipelineLayout p_layout; + VkPipeline pipeline; + } etc_decode; + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC3_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC3_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ struct vk_texcompress_astc_state *astc_decode; + }; + + #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1) +@@ -2415,6 +2421,9 @@ struct radv_image_view { + * This has a few differences for cube maps (e.g. type). + */ + union radv_descriptor storage_descriptor; + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ /* Block-compressed image views on GFX10+. */ ++ struct ac_surf_nbc_view nbc_view; + }; + + struct radv_image_create_info { +diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h +index d72b4d670d8..83a2785d4bd 100644 +--- a/src/amd/vulkan/radv_radeon_winsys.h ++++ b/src/amd/vulkan/radv_radeon_winsys.h +@@ -295,6 +295,8 @@ struct radeon_winsys { + + int (*get_fd)(struct radeon_winsys *ws); + ++ struct ac_addrlib *(*get_addrlib)(struct radeon_winsys *ws); + -+ struct radv_dispatch_info info = { -+ .blocks[0] = DIV_ROUND_UP(extent->width, 32), -+ .blocks[1] = DIV_ROUND_UP(extent->height, 32), -+ .blocks[2] = 1, -+ }; -+ radv_compute_dispatch(cmd_buffer, &info); -+ radv_image_view_finish(&bc1_iview); -+ radv_image_view_finish(&bc4_iview); -+ radv_image_view_finish(&dst_iview); + const struct vk_sync_type *const *(*get_sync_types)(struct radeon_winsys *ws); + }; + +diff --git a/src/amd/vulkan/vk_format.h b/src/amd/vulkan/vk_format.h +index 3cb035eb3cc..e6367784002 100644 +--- a/src/amd/vulkan/vk_format.h ++++ b/src/amd/vulkan/vk_format.h +@@ -195,4 +195,29 @@ vk_format_get_plane_format(VkFormat format, unsigned plane_id) + } + } + ++static inline VkFormat ++vk_texcompress_etc2_emulation_format(VkFormat etc2_format) ++{ ++ switch (etc2_format) { ++ case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: ++ return VK_FORMAT_R8G8B8A8_UNORM; ++ case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: ++ return VK_FORMAT_R8G8B8A8_SRGB; ++ case VK_FORMAT_EAC_R11_UNORM_BLOCK: ++ return VK_FORMAT_R16_UNORM; ++ case VK_FORMAT_EAC_R11_SNORM_BLOCK: ++ return VK_FORMAT_R16_SNORM; ++ case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: ++ return VK_FORMAT_R16G16_UNORM; ++ case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: ++ return VK_FORMAT_R16G16_SNORM; ++ default: ++ return VK_FORMAT_UNDEFINED; ++ } +} + -+void -+radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, -+ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) + #endif /* VK_FORMAT_H */ +diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c +index 9d160b65ed4..8947ee1dc50 100644 +--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c ++++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c +@@ -95,8 +95,17 @@ radv_amdgpu_winsys_surface_init(struct radeon_winsys *_ws, const struct ac_surf_ + return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf); + } + ++ ++static struct ac_addrlib * ++radv_amdgpu_get_addrlib(struct radeon_winsys *rws) +{ -+ struct radv_device *device = cmd_buffer->device; -+ struct 
radv_meta_saved_state saved_state; -+ radv_meta_save(&saved_state, cmd_buffer, -+ RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); ++ struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(rws); ++ return ws->addrlib; ++} + -+ VkImage _image = radv_image_to_handle(image); -+ VK_FROM_HANDLE(vk_image, vkImage, _image); + void + radv_amdgpu_surface_init_functions(struct radv_amdgpu_winsys *ws) + { ++ ws->base.get_addrlib = radv_amdgpu_get_addrlib; + ws->base.surface_init = radv_amdgpu_winsys_surface_init; + } +diff --git a/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl +new file mode 100644 +index 00000000000..7ef940a1603 +--- /dev/null ++++ b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl +@@ -0,0 +1,98 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ + -+ extent = radv_sanitize_image_extent(image->type, extent); -+ offset = radv_sanitize_image_offset(image->type, offset); + -+ VkExtent3D extent_copy = { -+ .width = extent.width, -+ .height = extent.height, -+ .depth = 1, -+ }; ++#define min3( a, b, c ) min( a, min( b, c ) ) ++#define max3( a, b, c ) max( a, max( b, c ) ) ++ ++#define float2 vec2 ++#define float3 vec3 ++#define float4 vec4 ++ ++#define int2 ivec2 ++#define int3 ivec3 ++#define int4 ivec4 ++ ++#define uint2 uvec2 ++#define uint3 uvec3 ++#define uint4 uvec4 ++ ++#define float2x2 mat2 ++#define float3x3 mat3 ++#define float4x4 mat4 ++#define ogre_float4x3 mat3x4 ++ ++#define ushort uint ++#define ushort3 uint3 ++#define ushort4 uint4 ++ ++//Short used for read operations. It's an int in GLSL & HLSL. An ushort in Metal ++#define rshort int ++#define rshort2 int2 ++#define rint int ++//Short used for write operations. It's an int in GLSL. 
An ushort in HLSL & Metal ++#define wshort2 int2 ++#define wshort3 int3 ++ ++#define toFloat3x3( x ) mat3( x ) ++#define buildFloat3x3( row0, row1, row2 ) mat3( row0, row1, row2 ) ++ ++#define mul( x, y ) ((x) * (y)) ++#define saturate(x) clamp( (x), 0.0, 1.0 ) ++#define lerp mix ++#define rsqrt inversesqrt ++#define INLINE ++#define NO_INTERPOLATION_PREFIX flat ++#define NO_INTERPOLATION_SUFFIX ++ ++#define PARAMS_ARG_DECL ++#define PARAMS_ARG ++ ++#define reversebits bitfieldReverse ++ ++#define OGRE_Sample( tex, sampler, uv ) texture( tex, uv ) ++#define OGRE_SampleLevel( tex, sampler, uv, lod ) textureLod( tex, uv, lod ) ++#define OGRE_SampleArray2D( tex, sampler, uv, arrayIdx ) texture( tex, vec3( uv, arrayIdx ) ) ++#define OGRE_SampleArray2DLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec3( uv, arrayIdx ), lod ) ++#define OGRE_SampleArrayCubeLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec4( uv, arrayIdx ), lod ) ++#define OGRE_SampleGrad( tex, sampler, uv, ddx, ddy ) textureGrad( tex, uv, ddx, ddy ) ++#define OGRE_SampleArray2DGrad( tex, sampler, uv, arrayIdx, ddx, ddy ) textureGrad( tex, vec3( uv, arrayIdx ), ddx, ddy ) ++#define OGRE_ddx( val ) dFdx( val ) ++#define OGRE_ddy( val ) dFdy( val ) ++#define OGRE_Load2D( tex, iuv, lod ) texelFetch( tex, iuv, lod ) ++#define OGRE_LoadArray2D( tex, iuv, arrayIdx, lod ) texelFetch( tex, ivec3( iuv, arrayIdx ), lod ) ++#define OGRE_Load2DMS( tex, iuv, subsample ) texelFetch( tex, iuv, subsample ) ++ ++#define OGRE_Load3D( tex, iuv, lod ) texelFetch( tex, ivec3( iuv ), lod ) ++ ++#define OGRE_GatherRed( tex, sampler, uv ) textureGather( tex, uv, 0 ) ++#define OGRE_GatherGreen( tex, sampler, uv ) textureGather( tex, uv, 1 ) ++#define OGRE_GatherBlue( tex, sampler, uv ) textureGather( tex, uv, 2 ) ++ ++#define bufferFetch1( buffer, idx ) texelFetch( buffer, idx ).x ++ ++#define OGRE_SAMPLER_ARG_DECL( samplerName ) ++#define OGRE_SAMPLER_ARG( samplerName ) ++ ++#define OGRE_Texture3D_float4 sampler3D ++#define OGRE_OUT_REF( declType, variableName ) out declType variableName ++#define OGRE_INOUT_REF( declType, variableName ) inout declType variableName +diff --git a/src/compiler/glsl/astc_decoder.glsl b/src/compiler/glsl/astc_decoder.glsl +new file mode 100644 +index 00000000000..ec00cc5ab41 +--- /dev/null ++++ b/src/compiler/glsl/astc_decoder.glsl +@@ -0,0 +1,1382 @@ ++#version 320 es ++precision highp float; ++precision highp int; ++precision highp usamplerBuffer; ++precision highp usampler2D; ++precision highp image2D; ++precision highp uimage2D; ++ ++/* Copyright (c) 2020-2022 Hans-Kristian Arntzen ++ * Copyright (c) 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifdef VULKAN ++ ++precision highp utextureBuffer; ++precision highp utexture2DArray; ++precision highp uimage2DArray; ++precision highp uimage3D; ++precision highp utexture3D; ++ ++#extension GL_EXT_samplerless_texture_functions : require ++layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 4) in; ++ ++layout(set = 0, binding = 0) writeonly uniform uimage2DArray OutputImage2Darray; ++layout(set = 0, binding = 0) writeonly uniform uimage3D OutputImage3D; ++layout(set = 0, binding = 1) uniform utexture2DArray PayloadInput2Darray; ++layout(set = 0, binding = 1) uniform utexture3D PayloadInput3D; ++layout(set = 0, binding = 2) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer; ++layout(set = 0, binding = 3) uniform utextureBuffer LUTEndpointUnquantize; ++layout(set = 0, binding = 4) uniform utextureBuffer LUTWeightQuantizer; ++layout(set = 0, binding = 5) uniform utextureBuffer LUTWeightUnquantize; ++layout(set = 0, binding = 6) uniform utextureBuffer LUTTritQuintDecode; ++layout(set = 0, binding = 7) uniform utextureBuffer LUTPartitionTable; ++ ++layout(constant_id = 2) const bool DECODE_8BIT = false; ++ ++layout(push_constant, std430) uniform pc { ++ ivec2 texel_blk_start; ++ ivec2 texel_end; ++ bool is_3Dimage; ++}; + -+ RADV_FROM_HANDLE(radv_image, rgba_image, image->staging_images->image_rgba); -+ RADV_FROM_HANDLE(radv_image, image1, image->staging_images->image_bc1); -+ RADV_FROM_HANDLE(radv_image, image4, image->staging_images->image_bc4); ++#else /* VULKAN */ + -+ encode_bc1(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image1); ++layout(local_size_x = %u, local_size_y = %u, local_size_z = 4) in; ++ ++#define utextureBuffer usamplerBuffer ++#define utexture2D usampler2D ++ ++layout(binding = 0) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer; ++layout(binding = 1) uniform utextureBuffer LUTEndpointUnquantize; ++layout(binding = 2) uniform utextureBuffer LUTWeightQuantizer; ++layout(binding = 3) uniform utextureBuffer LUTWeightUnquantize; ++layout(binding = 4) uniform utextureBuffer LUTTritQuintDecode; ++layout(binding = 5) uniform utexture2D LUTPartitionTable; ++layout(binding = 6) uniform utexture2D PayloadInput; ++ ++layout(rgba8ui, binding = 7) writeonly uniform uimage2D OutputImage; ++const bool DECODE_8BIT = true; ++ ++#endif /* VULKAN */ ++ ++const int MODE_LDR = 0; ++const int MODE_HDR = 1; ++const int MODE_HDR_LDR_ALPHA = 2; ++ ++const uvec4 error_color = uvec4(255, 0, 255, 255); ++ ++/* bitextract.h */ ++int extract_bits(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits <= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = int(bitfieldExtract(payload[offset >> 5], offset & 31, bits)); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ int result_first = int(bitfieldExtract(payload[offset >> 5], offset & 31, first_bits)); ++ int result_second = int(bitfieldExtract(payload[(offset >> 5) + 1], 0, bits - first_bits)); ++ result = result_first | (result_second << first_bits); ++ } ++ return result; ++} ++ ++/* bitextract.h */ ++int extract_bits_sign(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits 
<= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = bitfieldExtract(int(payload[offset >> 5]), offset & 31, bits); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ int result_first = int(bitfieldExtract(payload[offset >> 5], offset & 31, first_bits)); ++ int result_second = bitfieldExtract(int(payload[(offset >> 5) + 1]), 0, bits - first_bits); ++ result = result_first | (result_second << first_bits); ++ } ++ return result; ++} ++ ++/* bitextract.h */ ++int extract_bits_reverse(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits <= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = int(bitfieldReverse(bitfieldExtract(payload[offset >> 5], offset & 31, bits)) >> (32 - bits)); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ uint result_first = bitfieldExtract(payload[offset >> 5], offset & 31, first_bits); ++ uint result_second = bitfieldExtract(payload[(offset >> 5) + 1], 0, bits - first_bits); ++ result = int(bitfieldReverse(result_first | (result_second << first_bits)) >> (32 - bits)); ++ } ++ return result; ++} ++ ++void swap(inout int a, inout int b) ++{ ++ int tmp = a; ++ a = b; ++ b = tmp; ++} ++ ++ivec4 build_coord() ++{ ++ ivec2 payload_coord = ivec2(gl_WorkGroupID.xy) * 2; ++ payload_coord.x += int(gl_LocalInvocationID.z) & 1; ++ payload_coord.y += (int(gl_LocalInvocationID.z) >> 1) & 1; ++#ifdef VULKAN ++ payload_coord += texel_blk_start; ++#endif /* VULKAN */ ++ ivec2 coord = payload_coord * ivec2(gl_WorkGroupSize.xy); ++ coord += ivec2(gl_LocalInvocationID.xy); ++ return ivec4(coord, payload_coord); ++} ++ ++ivec4 interpolate_endpoint(ivec4 ep0, ivec4 ep1, ivec4 weight, int decode_mode) ++{ ++ if (decode_mode == MODE_HDR) ++ { ++ ep0 <<= 4; ++ ep1 <<= 4; ++ } ++ else if (decode_mode == MODE_HDR_LDR_ALPHA) ++ { ++ ep0.rgb <<= 4; ++ ep1.rgb <<= 4; ++ ep0.a *= 0x101; ++ ep1.a *= 0x101; ++ } ++ else if (DECODE_8BIT) ++ { ++ // This isn't quite right in all cases. ++ // In normal ASTC with sRGB, the alpha channel is supposed to ++ // be decoded as FP16, ++ // even when color components are SRGB 8-bit (?!?!?!?!). ++ // This is correct if decode_unorm8 mode is used though, ++ // for sanity, we're going to assume unorm8 decoding mode ++ // is implied when using sRGB. ++ ep0 = (ep0 << 8) | ivec4(0x80); ++ ep1 = (ep1 << 8) | ivec4(0x80); ++ } ++ else ++ { ++ ep0 *= 0x101; ++ ep1 *= 0x101; ++ } ++ ++ ivec4 color = (ep0 * (64 - weight) + ep1 * weight + 32) >> 6; ++ return color; ++} ++ ++bvec4 bvec_or(bvec4 a, bvec4 b) ++{ ++ return bvec4(ivec4(a) | ivec4(b)); ++} ++ ++uint round_down_quantize_fp16(int color) ++{ ++ // ASTC has a very peculiar way of converting the decoded result to FP16. ++ // 0xffff -> 1.0, and for everything else we get roundDownQuantizeFP16(vec4(c) / vec4(0x10000)). ++ int msb = findMSB(color); ++ int shamt = msb; ++ int m = ((color << 10) >> shamt) & 0x3ff; ++ int e = msb - 1; ++ uint decoded = color == 0xffff ? 0x3c00u : uint(e < 1 ? (color << 8) : (m | (e << 10))); ++ return decoded; ++} ++ ++uvec4 round_down_quantize_fp16(ivec4 color) ++{ ++ // ASTC has a very peculiar way of converting the decoded result to FP16. ++ // 0xffff -> 1.0, and for everything else we get roundDownQuantizeFP16(vec4(c) / vec4(0x10000)). 
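++ // Worked example (added note, not part of the upstream shader): for
++ // color = 0x8000 we get msb = 15, m = ((0x8000 << 10) >> 15) & 0x3ff = 0
++ // and e = 14, so decoded = 14 << 10 = 0x3800, which is FP16 0.5 and
++ // matches 0x8000 / 0x10000 exactly. The mixes below then handle the
++ // e < 1 denormal fallback (color << 8) and the 0xffff -> 0x3c00 (1.0)
++ // special case.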
++ ivec4 msb = findMSB(color); ++ ivec4 shamt = msb; ++ ivec4 m = ((color << 10) >> shamt) & 0x3ff; ++ ivec4 e = msb - 1; ++ uvec4 decoded = uvec4(m | (e << 10)); ++ uvec4 denorm_decode = uvec4(color << 8); ++ decoded = mix(decoded, uvec4(denorm_decode), lessThan(e, ivec4(1))); ++ decoded = mix(decoded, uvec4(0x3c00), equal(color, ivec4(0xffff))); ++ return decoded; ++} ++ ++uvec4 decode_fp16(ivec4 color, int decode_mode) ++{ ++ if (decode_mode != MODE_LDR) ++ { ++ // Interpret the value as FP16, but with some extra fixups along the way to make the interpolation more ++ // logarithmic (apparently). From spec: ++ ivec4 e = color >> 11; ++ ivec4 m = color & 0x7ff; ++ ivec4 mt = 4 * m - 512; ++ mt = mix(mt, ivec4(3 * m), lessThan(m, ivec4(512))); ++ mt = mix(mt, ivec4(5 * m - 2048), greaterThanEqual(m, ivec4(1536))); ++ ++ ivec4 decoded = (e << 10) + (mt >> 3); ++ // +Inf or NaN are decoded to 0x7bff (max finite value). ++ decoded = mix(decoded, ivec4(0x7bff), bvec_or(greaterThan(decoded & 0x7fff, ivec4(0x7c00)), equal(decoded, ivec4(0x7c00)))); ++ ++ if (decode_mode == MODE_HDR_LDR_ALPHA) ++ decoded.a = int(round_down_quantize_fp16(color.a)); ++ ++ return uvec4(decoded); ++ } ++ else ++ { ++ return round_down_quantize_fp16(color); ++ } ++} ++ ++struct BlockMode ++{ ++ ivec2 weight_grid_size; ++ int weight_mode_index; ++ int num_partitions; ++ int seed; ++ int cem; ++ int config_bits; ++ int primary_config_bits; ++ bool dual_plane; ++ bool void_extent; ++}; ++ ++bool decode_error = false; ++ ++BlockMode decode_block_mode(uvec4 payload) ++{ ++ BlockMode mode; ++ mode.void_extent = (payload.x & 0x1ffu) == 0x1fcu; ++ if (mode.void_extent) ++ return mode; ++ ++ mode.dual_plane = (payload.x & (1u << 10u)) != 0u; ++ ++ uint higher = (payload.x >> 2u) & 3u; ++ uint lower = payload.x & 3u; ++ ++ if (lower != 0u) ++ { ++ mode.weight_mode_index = int((payload.x >> 4u) & 1u); ++ mode.weight_mode_index |= int((payload.x << 1u) & 6u); ++ mode.weight_mode_index |= int((payload.x >> 6u) & 8u); ++ ++ if (higher < 2u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 2) + 4u + 4u * higher); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ } ++ else if (higher == 2u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 2) + 8u); ++ } ++ else ++ { ++ if ((payload.x & (1u << 8u)) != 0u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 1) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ } ++ else ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 1) + 6u); ++ } ++ } ++ } ++ else ++ { ++ int p3 = int(bitfieldExtract(payload.x, 9, 1)); ++ int hi = int(bitfieldExtract(payload.x, 7, 2)); ++ int lo = int(bitfieldExtract(payload.x, 5, 2)); ++ if (hi == 0) ++ { ++ mode.weight_grid_size.x = 12; ++ mode.weight_grid_size.y = lo + 2; ++ } ++ else if (hi == 1) ++ { ++ mode.weight_grid_size.x = lo + 2; ++ mode.weight_grid_size.y = 12; ++ } ++ else if (hi == 2) ++ { ++ mode.dual_plane = false; ++ p3 = 0; ++ mode.weight_grid_size.x = lo + 6; ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 9, 2) + 6u); ++ } ++ else ++ { ++ if (lo == 0) ++ mode.weight_grid_size = ivec2(6, 10); ++ else if (lo == 1) ++ mode.weight_grid_size = ivec2(10, 6); ++ else ++ decode_error = true; ++ } ++ ++ int p0 = int(bitfieldExtract(payload.x, 4, 1)); ++ int p1 = 
int(bitfieldExtract(payload.x, 2, 1)); ++ int p2 = int(bitfieldExtract(payload.x, 3, 1)); ++ mode.weight_mode_index = p0 + (p1 << 1) + (p2 << 2) + (p3 << 3); ++ } ++ ++ // 11 bits for block mode. ++ // 2 bits for partition select ++ // If partitions > 1: ++ // 4 bits CEM selector ++ // If dual_plane: ++ // 2 bits of CCS ++ // else: ++ // 10 for partition seed ++ // 2 bits for CEM main selector ++ // If CEM[1:0] = 00: ++ // 4 bits for CEM extra selector if all same type. ++ // else: ++ // (1 + 2) * num_partitions if different types. ++ // First 4 bits are encoded next to CEM[1:0], otherwise, packed before weights. ++ // If dual_plane: ++ // 2 bits of CCS before extra CEM bits. ++ const int CONFIG_BITS_BLOCK = 11; ++ const int CONFIG_BITS_PARTITION_MODE = 2; ++ const int CONFIG_BITS_SEED = 10; ++ const int CONFIG_BITS_PRIMARY_MULTI_CEM = 2; ++ const int CONFIG_BITS_CEM = 4; ++ const int CONFIG_BITS_EXTRA_CEM_PER_PARTITION = 3; ++ const int CONFIG_BITS_CCS = 2; ++ ++ mode.num_partitions = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK, CONFIG_BITS_PARTITION_MODE)) + 1; ++ ++ if (mode.num_partitions > 1) ++ { ++ mode.seed = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_SEED)); ++ mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED, ++ CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM)); ++ } ++ else ++ mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_CEM)); ++ ++ int config_bits; ++ if (mode.num_partitions > 1) ++ { ++ bool single_cem = (mode.cem & 3) == 0; ++ if (single_cem) ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + ++ CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM; ++ } ++ else ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + ++ CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + ++ CONFIG_BITS_EXTRA_CEM_PER_PARTITION * mode.num_partitions; ++ } ++ } ++ else ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_CEM; ++ } ++ ++ // Other config bits are packed before the weights. ++ int primary_config_bits; ++ if (mode.num_partitions > 1) ++ { ++ primary_config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED + ++ CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM; ++ } ++ else ++ primary_config_bits = config_bits; ++ ++ if (mode.dual_plane) ++ config_bits += CONFIG_BITS_CCS; ++ ++ // This is not allowed. ++ if (any(greaterThan(mode.weight_grid_size, ivec2(gl_WorkGroupSize.xy)))) ++ decode_error = true; ++ if (mode.dual_plane && mode.num_partitions > 3) ++ decode_error = true; ++ ++ mode.config_bits = config_bits; ++ mode.primary_config_bits = primary_config_bits; ++ return mode; ++} ++ ++int idiv3_floor(int v) ++{ ++ return (v * 0x5556) >> 16; ++} ++ ++int idiv3_ceil(int v) ++{ ++ return idiv3_floor(v + 2); ++} ++ ++int idiv5_floor(int v) ++{ ++ return (v * 0x3334) >> 16; ++} ++ ++int idiv5_ceil(int v) ++{ ++ return idiv5_floor(v + 4); ++} ++ ++uvec4 build_bitmask(int bits) ++{ ++ ivec4 num_bits = ivec4(bits, bits - 32, bits - 64, bits - 96); ++ uvec4 mask = uvec4(1) << clamp(num_bits, ivec4(0), ivec4(31)); ++ mask--; ++ mask = mix(mask, uvec4(0xffffffffu), greaterThanEqual(uvec4(bits), uvec4(32, 64, 96, 128))); ++ return mask; ++} ++ ++int decode_integer_sequence(uvec4 payload, int start_bit, int index, ivec3 quant) ++{ ++ int ret; ++ if (quant.y != 0) ++ { ++ // Trit-decoding. 
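++ // Background (added note, not from the upstream shader): ASTC integer
++ // sequence encoding packs five trits (base-3 digits) into 8 bits,
++ // interleaved with the five quant.x-bit LSB fields, so a block of five
++ // values occupies 5 * quant.x + 8 bits (matching the start_bit advance
++ // below). The scattered t0..t7 offsets pick those 8 bits back out, and
++ // the LUT expands them into the five decoded trits.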
++ int block = idiv5_floor(index); ++ int offset = index - block * 5; ++ start_bit += block * (5 * quant.x + 8); ++ ++ int t0_t1_offset = start_bit + (quant.x * 1 + 0); ++ int t2_t3_offset = start_bit + (quant.x * 2 + 2); ++ int t4_offset = start_bit + (quant.x * 3 + 4); ++ int t5_t6_offset = start_bit + (quant.x * 4 + 5); ++ int t7_offset = start_bit + (quant.x * 5 + 7); ++ ++ int t = (extract_bits(payload, t0_t1_offset, 2) << 0) | ++ (extract_bits(payload, t2_t3_offset, 2) << 2) | ++ (extract_bits(payload, t4_offset, 1) << 4) | ++ (extract_bits(payload, t5_t6_offset, 2) << 5) | ++ (extract_bits(payload, t7_offset, 1) << 7); ++ ++ t = int(texelFetch(LUTTritQuintDecode, t).x); ++ t = (t >> (3 * offset)) & 7; ++ ++ int m_offset = offset * quant.x; ++ m_offset += idiv5_ceil(offset * 8); ++ ++ if (quant.x != 0) ++ { ++ int m = extract_bits(payload, m_offset + start_bit, quant.x); ++ ret = (t << quant.x) | m; ++ } ++ else ++ ret = t; ++ } ++ else if (quant.z != 0) ++ { ++ // Quint-decoding ++ int block = idiv3_floor(index); ++ int offset = index - block * 3; ++ start_bit += block * (3 * quant.x + 7); ++ ++ int q0_q1_q2_offset = start_bit + (quant.x * 1 + 0); ++ int q3_q4_offset = start_bit + (quant.x * 2 + 3); ++ int q5_q6_offset = start_bit + (quant.x * 3 + 5); ++ ++ int q = (extract_bits(payload, q0_q1_q2_offset, 3) << 0) | ++ (extract_bits(payload, q3_q4_offset, 2) << 3) | ++ (extract_bits(payload, q5_q6_offset, 2) << 5); ++ ++ q = int(texelFetch(LUTTritQuintDecode, 256 + q).x); ++ q = (q >> (3 * offset)) & 7; ++ ++ int m_offset = offset * quant.x; ++ m_offset += idiv3_ceil(offset * 7); ++ ++ if (quant.x != 0) ++ { ++ int m = extract_bits(payload, m_offset + start_bit, quant.x); ++ ret = (q << quant.x) | m; ++ } ++ else ++ ret = q; ++ } ++ else ++ { ++ int bit = index * quant.x; ++ ret = extract_bits(payload, start_bit + bit, quant.x); ++ } ++ return ret; ++} ++ ++ivec2 normalize_coord(ivec2 pixel_coord) ++{ ++ ivec2 D = ivec2((vec2((1024 + ivec2(gl_WorkGroupSize.xy >> 1u))) + 0.5) / vec2(gl_WorkGroupSize.xy - 1u)); ++ ivec2 c = D * pixel_coord; ++ return c; ++} ++ ++int decode_weight(uvec4 payload, int weight_index, ivec4 quant) ++{ ++ int primary_weight = decode_integer_sequence(payload, 0, weight_index, quant.xyz); ++ primary_weight = int(texelFetch(LUTWeightUnquantize, primary_weight + quant.w).x); ++ return primary_weight; ++} ++ ++int decode_weight_bilinear(uvec4 payload, ivec2 coord, int weight_resolution, ++ int stride, int offset, ivec2 fractional, ivec4 quant) ++{ ++ int index = coord.y * weight_resolution + coord.x; ++ int p00 = decode_weight(payload, stride * index + offset, quant); ++ int p10, p01, p11; ++ ++ if (fractional.x != 0) ++ p10 = decode_weight(payload, stride * (index + 1) + offset, quant); ++ else ++ p10 = p00; ++ ++ if (fractional.y != 0) ++ { ++ p01 = decode_weight(payload, stride * (index + weight_resolution) + offset, quant); ++ if (fractional.x != 0) ++ p11 = decode_weight(payload, stride * (index + weight_resolution + 1) + offset, quant); ++ else ++ p11 = p01; ++ } ++ else ++ { ++ p01 = p00; ++ p11 = p10; ++ } ++ ++ int w11 = (fractional.x * fractional.y + 8) >> 4; ++ int w10 = fractional.x - w11; ++ int w01 = fractional.y - w11; ++ int w00 = 16 - fractional.x - fractional.y + w11; ++ return (p00 * w00 + p10 * w10 + p01 * w01 + p11 * w11 + 8) >> 4; ++} ++ ++ivec4 decode_weights(uvec4 payload, BlockMode mode, ivec2 normalized_pixel, out int weight_cost_bits) ++{ ++ ivec4 quant = ivec4(texelFetch(LUTWeightQuantizer, mode.weight_mode_index)); ++ int num_weights = 
mode.weight_grid_size.x * mode.weight_grid_size.y; ++ num_weights <<= int(mode.dual_plane); ++ weight_cost_bits = ++ quant.x * num_weights + ++ idiv5_ceil(num_weights * 8 * quant.y) + ++ idiv3_ceil(num_weights * 7 * quant.z); ++ ++ // Decoders must deal with error conditions and return the correct error color. ++ if (weight_cost_bits < 24 || weight_cost_bits > 96 || num_weights > 64) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ ++ int ccs; ++ if (mode.dual_plane) ++ { ++ int extra_cem_bits = 0; ++ if ((mode.cem & 3) != 0) ++ extra_cem_bits = max(mode.num_partitions * 3 - 4, 0); ++ ccs = extract_bits(payload, 126 - weight_cost_bits - extra_cem_bits, 2); ++ } ++ ++ payload = bitfieldReverse(payload); ++ payload = payload.wzyx; ++ payload &= build_bitmask(weight_cost_bits); ++ ++ // Scale the normalized coordinate to weight grid. ++ ivec2 weight_pixel_fixed_point = (normalized_pixel * (mode.weight_grid_size - 1) + 32) >> 6; ++ ivec2 weight_pixel = weight_pixel_fixed_point >> 4; ++ ivec2 weight_pixel_fractional = weight_pixel_fixed_point & 0xf; ++ ++ ivec4 ret; ++ int primary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x, ++ 1 << int(mode.dual_plane), 0, ++ weight_pixel_fractional, quant); ++ if (mode.dual_plane) ++ { ++ int secondary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x, ++ 2, 1, ++ weight_pixel_fractional, quant); ++ ret = mix(ivec4(primary_weight), ivec4(secondary_weight), equal(ivec4(ccs), ivec4(0, 1, 2, 3))); ++ } ++ else ++ ret = ivec4(primary_weight); ++ ++ return ret; ++} ++ ++void decode_endpoint_ldr_luma_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ ep0 = ivec4(ivec3(v0), 0xff); ++ ep1 = ivec4(ivec3(v1), 0xff); ++} ++ ++void decode_endpoint_hdr_luma_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int y0, y1; ++ if (v1 >= v0) ++ { ++ y0 = v0 << 4; ++ y1 = v1 << 4; ++ } ++ else ++ { ++ y0 = (v1 << 4) + 8; ++ y1 = (v0 << 4) - 8; ++ } ++ ++ ep0 = ivec4(ivec3(y0), 0x780); ++ ep1 = ivec4(ivec3(y1), 0x780); ++} ++ ++void decode_endpoint_hdr_luma_direct_small_range(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int y0, y1, d; ++ ++ if ((v0 & 0x80) != 0) ++ { ++ y0 = ((v1 & 0xe0) << 4) | ((v0 & 0x7f) << 2); ++ d = (v1 & 0x1f) << 2; ++ } ++ else ++ { ++ y0 = ((v1 & 0xf0) << 4) | ((v0 & 0x7f) << 1); ++ d = (v1 & 0x0f) << 1; ++ } ++ ++ y1 = min(y0 + d, 0xfff); ++ ++ ep0 = ivec4(ivec3(y0), 0x780); ++ ep1 = ivec4(ivec3(y1), 0x780); ++} ++ ++void decode_endpoint_ldr_luma_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int l0 = (v0 >> 2) | (v1 & 0xc0); ++ int l1 = l0 + (v1 & 0x3f); ++ l1 = min(l1, 0xff); ++ ep0 = ivec4(ivec3(l0), 0xff); ++ ep1 = ivec4(ivec3(l1), 0xff); ++} ++ ++void decode_endpoint_ldr_luma_alpha_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ ep0 = ivec4(ivec3(v0), v2); ++ ep1 = ivec4(ivec3(v1), v3); ++} ++ ++ivec4 blue_contract(int r, int g, int b, int a) ++{ ++ ivec4 ret; ++ ret.r = (r + b) >> 1; ++ ret.g = (g + b) >> 1; ++ ret.b = b; ++ ret.a = a; ++ return ret; ++} ++ ++void bit_transfer_signed(inout int a, inout int b) ++{ ++ b >>= 1; ++ b |= a & 0x80; ++ a >>= 1; ++ a &= 0x3f; ++ a = bitfieldExtract(a, 0, 6); ++} ++ ++void decode_endpoint_ldr_luma_alpha_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ int v0_v1 = clamp(v0 + v1, 0, 0xff); ++ int v2_v3 = clamp(v2 + v3, 0, 0xff); ++ v0 = clamp(v0, 0, 
0xff); ++ v2 = clamp(v2, 0, 0xff); ++ ep0 = ivec4(ivec3(v0), v2); ++ ep1 = ivec4(ivec3(v0_v1), v2_v3); ++} ++ ++void decode_endpoint_ldr_rgb_base_scale(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ ep0 = ivec4((ivec3(v0, v1, v2) * v3) >> 8, 0xff); ++ ep1 = ivec4(v0, v1, v2, 0xff); ++} ++ ++void decode_endpoint_ldr_rgb_base_scale_two_a(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ ep0 = ivec4((ivec3(v0, v1, v2) * v3) >> 8, v4); ++ ep1 = ivec4(v0, v1, v2, v5); ++} ++ ++void decode_endpoint_ldr_rgb_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ int s0 = v0 + v2 + v4; ++ int s1 = v1 + v3 + v5; ++ if (s1 >= s0) ++ { ++ ep0 = ivec4(v0, v2, v4, 0xff); ++ ep1 = ivec4(v1, v3, v5, 0xff); ++ } ++ else ++ { ++ ep0 = blue_contract(v1, v3, v5, 0xff); ++ ep1 = blue_contract(v0, v2, v4, 0xff); ++ } ++} ++ ++void decode_endpoint_hdr_rgb_scale(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ // Mind-numbing weird format, just copy from spec ... ++ int mode_value = ((v0 & 0xc0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); ++ int major_component; ++ int mode; ++ ++ if ((mode_value & 0xc) != 0xc) ++ { ++ major_component = mode_value >> 2; ++ mode = mode_value & 3; ++ } ++ else if (mode_value != 0xf) ++ { ++ major_component = mode_value & 3; ++ mode = 4; ++ } ++ else ++ { ++ major_component = 0; ++ mode = 5; ++ } ++ ++ int red = v0 & 0x3f; ++ int green = v1 & 0x1f; ++ int blue = v2 & 0x1f; ++ int scale = v3 & 0x1f; ++ ++ int x0 = (v1 >> 6) & 1; ++ int x1 = (v1 >> 5) & 1; ++ int x2 = (v2 >> 6) & 1; ++ int x3 = (v2 >> 5) & 1; ++ int x4 = (v3 >> 7) & 1; ++ int x5 = (v3 >> 6) & 1; ++ int x6 = (v3 >> 5) & 1; ++ ++ int ohm = 1 << mode; ++ if ((ohm & 0x30) != 0) green |= x0 << 6; ++ if ((ohm & 0x3a) != 0) green |= x1 << 5; ++ if ((ohm & 0x30) != 0) blue |= x2 << 6; ++ if ((ohm & 0x3a) != 0) blue |= x3 << 5; ++ if ((ohm & 0x3d) != 0) scale |= x6 << 5; ++ if ((ohm & 0x2d) != 0) scale |= x5 << 6; ++ if ((ohm & 0x04) != 0) scale |= x4 << 7; ++ if ((ohm & 0x3b) != 0) red |= x4 << 6; ++ if ((ohm & 0x04) != 0) red |= x3 << 6; ++ if ((ohm & 0x10) != 0) red |= x5 << 7; ++ if ((ohm & 0x0f) != 0) red |= x2 << 7; ++ if ((ohm & 0x05) != 0) red |= x1 << 8; ++ if ((ohm & 0x0a) != 0) red |= x0 << 8; ++ if ((ohm & 0x05) != 0) red |= x0 << 9; ++ if ((ohm & 0x02) != 0) red |= x6 << 9; ++ if ((ohm & 0x01) != 0) red |= x3 << 10; ++ if ((ohm & 0x02) != 0) red |= x5 << 10; ++ ++ int shamt = max(mode, 1); ++ red <<= shamt; ++ green <<= shamt; ++ blue <<= shamt; ++ scale <<= shamt; ++ ++ if (mode != 5) ++ { ++ green = red - green; ++ blue = red - blue; ++ } ++ ++ if (major_component == 1) ++ swap(red, green); ++ else if (major_component == 2) ++ swap(red, blue); ++ ++ ep1 = ivec4(clamp(ivec3(red, green, blue), ivec3(0), ivec3(0xfff)), 0x780); ++ ep0 = ivec4(clamp(ivec3(red, green, blue) - scale, ivec3(0), ivec3(0xfff)), 0x780); ++} ++ ++void decode_endpoint_hdr_rgb_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ int major_component = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6); ++ ++ if (major_component == 3) ++ { ++ ep0 = ivec4(v0 << 4, v2 << 4, (v4 & 0x7f) << 5, 0x780); ++ ep1 = ivec4(v1 << 4, v3 << 4, (v5 & 0x7f) << 5, 0x780); ++ return; ++ } ++ ++ int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5); ++ int va = v0 | ((v1 & 0x40) << 2); ++ int vb0 = v2 & 0x3f; ++ int vb1 = v3 & 0x3f; ++ int vc = v1 & 0x3f; ++ int vd0 = v4 & 0x7f; ++ int 
vd1 = v5 & 0x7f; ++ ++ int d_bits = 7 - (mode & 1); ++ if ((mode & 5) == 4) ++ d_bits -= 2; ++ ++ vd0 = bitfieldExtract(vd0, 0, d_bits); ++ vd1 = bitfieldExtract(vd1, 0, d_bits); ++ ++ int x0 = (v2 >> 6) & 1; ++ int x1 = (v3 >> 6) & 1; ++ int x2 = (v4 >> 6) & 1; ++ int x3 = (v5 >> 6) & 1; ++ int x4 = (v4 >> 5) & 1; ++ int x5 = (v5 >> 5) & 1; ++ ++ int ohm = 1 << mode; ++ if ((ohm & 0xa4) != 0) va |= x0 << 9; ++ if ((ohm & 0x08) != 0) va |= x2 << 9; ++ if ((ohm & 0x50) != 0) va |= x4 << 9; ++ if ((ohm & 0x50) != 0) va |= x5 << 10; ++ if ((ohm & 0xa0) != 0) va |= x1 << 10; ++ if ((ohm & 0xc0) != 0) va |= x2 << 11; ++ ++ if ((ohm & 0x04) != 0) vc |= x1 << 6; ++ if ((ohm & 0xe8) != 0) vc |= x3 << 6; ++ if ((ohm & 0x20) != 0) vc |= x2 << 7; ++ ++ if ((ohm & 0x5b) != 0) vb0 |= x0 << 6; ++ if ((ohm & 0x5b) != 0) vb1 |= x1 << 6; ++ if ((ohm & 0x12) != 0) vb0 |= x2 << 7; ++ if ((ohm & 0x12) != 0) vb1 |= x3 << 7; ++ ++ int shamt = (mode >> 1) ^ 3; ++ va <<= shamt; ++ vb0 <<= shamt; ++ vb1 <<= shamt; ++ vc <<= shamt; ++ vd0 <<= shamt; ++ vd1 <<= shamt; ++ ++ ep1 = ivec4(clamp(ivec3(va, va - vb0, va - vb1), ivec3(0), ivec3(0xfff)), 0x780); ++ ep0 = ivec4(clamp(ivec3(va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1), ivec3(0), ivec3(0xfff)), 0x780); ++ ++ if (major_component == 1) ++ { ++ swap(ep0.r, ep0.g); ++ swap(ep1.r, ep1.g); ++ } ++ else if (major_component == 2) ++ { ++ swap(ep0.r, ep0.b); ++ swap(ep1.r, ep1.b); ++ } ++} ++ ++void decode_endpoint_ldr_rgb_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ bit_transfer_signed(v5, v4); ++ if (v1 + v3 + v5 >= 0) ++ { ++ ep0 = ivec4(v0, v2, v4, 0xff); ++ ep1 = ivec4(v0 + v1, v2 + v3, v4 + v5, 0xff); ++ } ++ else ++ { ++ ep0 = blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xff); ++ ep1 = blue_contract(v0, v2, v4, 0xff); ++ } ++ ++ ep0.rgb = clamp(ep0.rgb, ivec3(0), ivec3(0xff)); ++ ep1.rgb = clamp(ep1.rgb, ivec3(0), ivec3(0xff)); ++} ++ ++void decode_endpoint_ldr_rgba_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, ++ int v4, int v5, int v6, int v7) ++{ ++ int s0 = v0 + v2 + v4; ++ int s1 = v1 + v3 + v5; ++ if (s1 >= s0) ++ { ++ ep0 = ivec4(v0, v2, v4, v6); ++ ep1 = ivec4(v1, v3, v5, v7); ++ } ++ else ++ { ++ ep0 = blue_contract(v1, v3, v5, v7); ++ ep1 = blue_contract(v0, v2, v4, v6); ++ } ++} ++ ++void decode_endpoint_ldr_rgba_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ bit_transfer_signed(v5, v4); ++ bit_transfer_signed(v7, v6); ++ ++ if (v1 + v3 + v5 >= 0) ++ { ++ ep0 = ivec4(v0, v2, v4, v6); ++ ep1 = ivec4(v0 + v1, v2 + v3, v4 + v5, v6 + v7); ++ } ++ else ++ { ++ ep0 = blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7); ++ ep1 = blue_contract(v0, v2, v4, v6); ++ } ++ ++ ep0 = clamp(ep0, ivec4(0), ivec4(0xff)); ++ ep1 = clamp(ep1, ivec4(0), ivec4(0xff)); ++} ++ ++void decode_endpoint_hdr_alpha(out int ep0, out int ep1, int v6, int v7) ++{ ++ int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2); ++ v6 &= 0x7f; ++ v7 &= 0x7f; ++ ++ if (mode == 3) ++ { ++ ep0 = v6 << 5; ++ ep1 = v7 << 5; ++ } ++ else ++ { ++ v6 |= (v7 << (mode + 1)) & 0x780; ++ v7 &= 0x3f >> mode; ++ v7 ^= 0x20 >> mode; ++ v7 -= 0x20 >> mode; ++ v6 <<= 4 - mode; ++ v7 <<= 4 - mode; ++ v7 += v6; ++ v7 = clamp(v7, 0, 0xfff); ++ ep0 = v6; ++ ep1 = v7; ++ } ++} ++ ++void decode_endpoint(out ivec4 ep0, out ivec4 ep1, out int 
decode_mode, ++ uvec4 payload, int bit_offset, ivec4 quant, int ep_mode, ++ int base_endpoint_index, int num_endpoint_bits) ++{ ++ num_endpoint_bits += bit_offset; ++ payload &= build_bitmask(num_endpoint_bits); ++ ++ // Could of course use an array, but that doesn't lower nicely to indexed registers on all GPUs. ++ int v0, v1, v2, v3, v4, v5, v6, v7; ++ int num_values = 2 * ((ep_mode >> 2) + 1); ++ ++#define DECODE_EP(i) \ ++ int(texelFetch(LUTEndpointUnquantize, quant.w + decode_integer_sequence(payload, bit_offset, i + base_endpoint_index, quant.xyz)).x) ++ ++ int hi_bits = ep_mode >> 2; ++ v0 = DECODE_EP(0); ++ v1 = DECODE_EP(1); ++ ++ if (hi_bits >= 1) ++ { ++ v2 = DECODE_EP(2); ++ v3 = DECODE_EP(3); ++ } ++ ++ if (hi_bits >= 2) ++ { ++ v4 = DECODE_EP(4); ++ v5 = DECODE_EP(5); ++ } ++ ++ if (hi_bits >= 3) ++ { ++ v6 = DECODE_EP(6); ++ v7 = DECODE_EP(7); ++ } ++ ++ switch (ep_mode) ++ { ++ case 0: ++ decode_endpoint_ldr_luma_direct(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 1: ++ decode_endpoint_ldr_luma_base_offset(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 2: ++ decode_endpoint_hdr_luma_direct(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 3: ++ decode_endpoint_hdr_luma_direct_small_range(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 4: ++ decode_endpoint_ldr_luma_alpha_direct(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 5: ++ decode_endpoint_ldr_luma_alpha_base_offset(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 6: ++ decode_endpoint_ldr_rgb_base_scale(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 7: ++ decode_endpoint_hdr_rgb_scale(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 8: ++ decode_endpoint_ldr_rgb_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 9: ++ decode_endpoint_ldr_rgb_base_offset(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 10: ++ decode_endpoint_ldr_rgb_base_scale_two_a(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 11: ++ case 14: ++ case 15: ++ decode_endpoint_hdr_rgb_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ if (ep_mode == 14) ++ { ++ ep0.a = v6; ++ ep1.a = v7; ++ decode_mode = MODE_HDR_LDR_ALPHA; ++ } ++ else if (ep_mode == 15) ++ { ++ decode_endpoint_hdr_alpha(ep0.a, ep1.a, v6, v7); ++ decode_mode = MODE_HDR; ++ } ++ else ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 12: ++ decode_endpoint_ldr_rgba_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5, v6, v7); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 13: ++ decode_endpoint_ldr_rgba_base_offset(ep0, ep1, ++ v0, v1, v2, v3, v4, v5, v6, v7); ++ decode_mode = MODE_LDR; ++ break; ++ } ++ ++ if (DECODE_8BIT && decode_mode != MODE_LDR) ++ decode_error = true; ++} ++ ++#define CHECK_DECODE_ERROR() do { \ ++ if (decode_error) \ ++ { \ ++ emit_decode_error(coord.xy); \ ++ return; \ ++ } \ ++} while(false) ++ ++void emit_decode_error(ivec2 coord) ++{ ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord, gl_WorkGroupID.z), error_color); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord, gl_WorkGroupID.z), error_color); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord, error_color); ++#endif /* VULKAN */ ++} ++ ++int compute_num_endpoint_pairs(int num_partitions, int cem) ++{ ++ int ret; ++ if (num_partitions > 1) ++ { ++ bool single_cem 
= (cem & 3) == 0; ++ if (single_cem) ++ ret = ((cem >> 4) + 1) * num_partitions; ++ else ++ ret = (cem & 3) * num_partitions + bitCount(bitfieldExtract(uint(cem), 2, num_partitions)); ++ } ++ else ++ { ++ ret = (cem >> 2) + 1; ++ } ++ return ret; ++} ++ ++void decode_cem_base_endpoint(uvec4 payload, int weight_cost_bits, inout int cem, out int base_endpoint_index, ++ int num_partitions, int partition_index) ++{ ++ if (num_partitions > 1) ++ { ++ bool single_cem = (cem & 3) == 0; ++ if (single_cem) ++ { ++ cem >>= 2; ++ base_endpoint_index = ((cem >> 2) + 1) * partition_index; ++ } ++ else ++ { ++ if (partition_index != 0) ++ base_endpoint_index = (cem & 3) * partition_index + bitCount(bitfieldExtract(uint(cem), 2, partition_index)); ++ else ++ base_endpoint_index = 0; ++ ++ int base_class = (cem & 3) - 1; ++ int extra_cem_bits = num_partitions * 3 - 4; ++ int extra_bits = extract_bits(payload, 128 - weight_cost_bits - extra_cem_bits, extra_cem_bits); ++ cem = (extra_bits << 4) | (cem >> 2); ++ ++ int class_offset_bit = (cem >> partition_index) & 1; ++ int ep_bits = (cem >> (num_partitions + 2 * partition_index)) & 3; ++ ++ cem = 4 * (base_class + class_offset_bit) + ep_bits; ++ } ++ base_endpoint_index *= 2; ++ } ++ else ++ { ++ base_endpoint_index = 0; ++ } ++} ++ ++ivec4 void_extent_color(uvec4 payload, out int decode_mode) ++{ ++ int min_s = extract_bits(payload, 12, 13); ++ int max_s = extract_bits(payload, 12 + 13, 13); ++ int min_t = extract_bits(payload, 12 + 2 * 13, 13); ++ int max_t = extract_bits(payload, 12 + 3 * 13, 13); ++ ++ int reserved = extract_bits(payload, 10, 2); ++ if (reserved != 3) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ ++ if (!all(equal(ivec4(min_s, max_s, min_t, max_t), ivec4((1 << 13) - 1)))) ++ { ++ if (any(greaterThanEqual(ivec2(min_s, min_t), ivec2(max_s, max_t)))) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ } ++ ++ decode_mode = (payload.x & (1u << 9)) != 0u ? 
MODE_HDR : MODE_LDR; ++ ++ int r = extract_bits(payload, 64, 16); ++ int g = extract_bits(payload, 64 + 16, 16); ++ int b = extract_bits(payload, 64 + 32, 16); ++ int a = extract_bits(payload, 64 + 48, 16); ++ ++ return ivec4(r, g, b, a); ++} ++ ++void main() ++{ ++ ivec4 coord = build_coord(); ++#ifdef VULKAN ++ if (any(greaterThanEqual(coord.xy, texel_end.xy))) ++ return; ++#else /* VULKAN */ ++ if (any(greaterThanEqual(coord.xy, imageSize(OutputImage)))) ++ return; ++#endif /* VULKAN */ ++ ++ ivec2 pixel_coord = ivec2(gl_LocalInvocationID.xy); ++ int linear_pixel = int(gl_WorkGroupSize.x) * pixel_coord.y + pixel_coord.x; ++ uvec4 payload; ++#ifdef VULKAN ++ if (is_3Dimage) ++ payload = texelFetch(PayloadInput3D, ivec3(coord.zw, gl_WorkGroupID.z), 0); ++ else ++ payload = texelFetch(PayloadInput2Darray,ivec3(coord.zw, gl_WorkGroupID.z), 0); ++#else /* VULKAN */ ++ payload = texelFetch(PayloadInput, coord.zw, 0); ++#endif /* VULKAN */ ++ ++ BlockMode block_mode = decode_block_mode(payload); ++ CHECK_DECODE_ERROR(); ++ ++ ivec4 final_color; ++ int decode_mode; ++ if (block_mode.void_extent) ++ { ++ final_color = void_extent_color(payload, decode_mode); ++ CHECK_DECODE_ERROR(); ++ } ++ else ++ { ++ int weight_cost_bits; ++ ivec4 weights = decode_weights(payload, block_mode, normalize_coord(pixel_coord), weight_cost_bits); ++ ++ int partition_index = 0; ++ if (block_mode.num_partitions > 1) ++ { ++ int lut_x = pixel_coord.x + int(gl_WorkGroupSize.x) * (block_mode.seed & 31); ++ int lut_y = pixel_coord.y + int(gl_WorkGroupSize.y) * (block_mode.seed >> 5); ++#ifdef VULKAN ++ int lut_width = int(gl_WorkGroupSize.x) * 32; ++ partition_index = int(texelFetch(LUTPartitionTable, lut_y * lut_width + lut_x).x); ++#else /* VULKAN */ ++ partition_index = int(texelFetch(LUTPartitionTable, ivec2(lut_x, lut_y), 0).x); ++#endif /* VULKAN */ ++ partition_index = (partition_index >> (2 * block_mode.num_partitions - 4)) & 3; ++ } ++ ++ int available_endpoint_bits = max(128 - block_mode.config_bits - weight_cost_bits, 0); ++ ++ // In multi-partition mode, the 6-bit CEM field is encoded as ++ // First two bits tell if all CEM field are the same, if not we specify a class offset, and N bits ++ // after that will offset the class by 1. ++ int num_endpoint_pairs = compute_num_endpoint_pairs(block_mode.num_partitions, block_mode.cem); ++ ++ // Error color must be emitted if we need more than 18 integer sequence encoded values of color. ++ if (num_endpoint_pairs > 9) ++ { ++ decode_error = true; ++ emit_decode_error(coord.xy); ++ return; ++ } ++ ++ ivec4 endpoint_quant = ivec4(texelFetch(LUTRemainingBitsToEndpointQuantizer, ++ 128 * (num_endpoint_pairs - 1) + available_endpoint_bits)); ++ ++ // Only read the bits we need for endpoints. ++ int num_endpoint_values = num_endpoint_pairs * 2; ++ available_endpoint_bits = ++ endpoint_quant.x * num_endpoint_values + ++ idiv5_ceil(endpoint_quant.y * 8 * num_endpoint_values) + ++ idiv3_ceil(endpoint_quant.z * 7 * num_endpoint_values); ++ ++ // No space left for color endpoints. ++ if (all(equal(endpoint_quant.xyz, ivec3(0)))) ++ { ++ decode_error = true; ++ emit_decode_error(coord.xy); ++ return; ++ } ++ ++ int endpoint_bit_offset = block_mode.primary_config_bits; ++ ivec4 ep0, ep1; ++ ++ // Decode CEM for multi-partition schemes. 
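++ // With more than one partition, only the 2-bit class selector and the
++ // first four CEM bits sit next to the partition seed; the remaining
++ // per-partition bits are stored just below the weight stream, so
++ // decode_cem_base_endpoint() reassembles this partition's 4-bit CEM
++ // and the offset of its first endpoint value in the sequence.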
++ int cem = block_mode.cem; ++ int base_endpoint_index; ++ decode_cem_base_endpoint(payload, weight_cost_bits, cem, base_endpoint_index, ++ block_mode.num_partitions, partition_index); ++ ++ decode_endpoint(ep0, ep1, decode_mode, payload, endpoint_bit_offset, endpoint_quant, ++ cem, base_endpoint_index, available_endpoint_bits); ++ CHECK_DECODE_ERROR(); ++ ++ final_color = interpolate_endpoint(ep0, ep1, weights, decode_mode); ++ } ++ ++ if (DECODE_8BIT) ++ { ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord.xy, gl_WorkGroupID.z), uvec4(final_color >> 8)); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), uvec4(final_color >> 8)); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord.xy, uvec4(final_color >> 8)); ++#endif /* VULKAN */ ++ } ++ else ++ { ++ uvec4 encoded; ++ if (block_mode.void_extent && decode_mode == MODE_HDR) ++ encoded = uvec4(final_color); ++ else ++ encoded = decode_fp16(final_color, decode_mode); ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord.xy, gl_WorkGroupID.z), encoded); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), encoded); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord.xy, encoded); ++#endif /* VULKAN */ ++ } ++} +diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl +new file mode 100644 +index 00000000000..251635d7b69 +--- /dev/null ++++ b/src/compiler/glsl/bc1.glsl +@@ -0,0 +1,544 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#version 310 es ++ ++#if defined(GL_ES) && GL_ES == 1 ++ // Desktop GLSL allows the const keyword for either compile-time or ++ // run-time constants. GLSL ES only allows the keyword for compile-time ++ // constants. Since we use const on run-time constants, define it to ++ // nothing. 
++ #define const ++#endif ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++#define FLT_MAX 340282346638528859811704183484516925440.0f ++ ++layout( location = 0 ) uniform uint p_numRefinements; ++ ++uniform sampler2D srcTex; ++ ++layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++layout( std430, binding = 1 ) readonly restrict buffer globalBuffer ++{ ++ float2 c_oMatch5[256]; ++ float2 c_oMatch6[256]; ++}; ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ ++float3 rgb565to888( float rgb565 ) ++{ ++ float3 retVal; ++ retVal.x = floor( rgb565 / 2048.0f ); ++ retVal.y = floor( mod( rgb565, 2048.0f ) / 32.0f ); ++ retVal.z = floor( mod( rgb565, 32.0f ) ); ++ ++ // This is the correct 565 to 888 conversion: ++ // rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f ) ++ // ++ // However stb_dxt follows a different one: ++ // rb = floor( rb * ( 256 / 32 + 8 / 32 ) ); ++ // g = floor( g * ( 256 / 64 + 4 / 64 ) ); ++ // ++ // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded ++ // It's quite possible this is the reason: ++ // http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/ ++ // ++ // Or maybe it's just because it's cheap to do with integer shifts. ++ // Anyway, we follow stb_dxt's conversion just in case ++ // (gives almost the same result, with 1 or -1 of difference for a very few values) ++ // ++ // Perhaps when we make 888 -> 565 -> 888 it doesn't matter ++ // because they end up mapping to the original number ++ ++ return floor( retVal * float3( 8.25f, 4.0625f, 8.25f ) ); ++} ++ ++float rgb888to565( float3 rgbValue ) ++{ ++ rgbValue.rb = floor( rgbValue.rb * 31.0f / 255.0f + 0.5f ); ++ rgbValue.g = floor( rgbValue.g * 63.0f / 255.0f + 0.5f ); ++ ++ return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b; ++} ++ ++// linear interpolation at 1/3 point between a and b, using desired rounding type ++float3 lerp13( float3 a, float3 b ) ++{ ++#ifdef STB_DXT_USE_ROUNDING_BIAS ++ // with rounding bias ++ return a + floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f ); ++#else ++ // without rounding bias ++ return floor( ( 2.0f * a + b ) / 3.0f ); ++#endif ++} ++ ++/// Unpacks a block of 4 colours from two 16-bit endpoints ++void EvalColors( out float3 colours[4], float c0, float c1 ) ++{ ++ colours[0] = rgb565to888( c0 ); ++ colours[1] = rgb565to888( c1 ); ++ colours[2] = lerp13( colours[0], colours[1] ); ++ colours[3] = lerp13( colours[1], colours[0] ); ++} ++ ++/** The color optimization function. 
(Clever code, part 1) ++@param outMinEndp16 [out] ++ Minimum endpoint, in RGB565 ++@param outMaxEndp16 [out] ++ Maximum endpoint, in RGB565 ++*/ ++void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 ) ++{ ++ // determine color distribution ++ float3 avgColour; ++ float3 minColour; ++ float3 maxColour; ++ ++ avgColour = minColour = maxColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz; ++ for( int i = 1; i < 16; ++i ) ++ { ++ const float3 currColourUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz; ++ avgColour += currColourUnorm; ++ minColour = min( minColour, currColourUnorm ); ++ maxColour = max( maxColour, currColourUnorm ); ++ } ++ ++ avgColour = round( avgColour * 255.0f / 16.0f ); ++ maxColour *= 255.0f; ++ minColour *= 255.0f; ++ ++ // determine covariance matrix ++ float cov[6]; ++ for( int i = 0; i < 6; ++i ) ++ cov[i] = 0.0f; ++ ++ for( int i = 0; i < 16; ++i ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ float3 rgbDiff = currColour - avgColour; ++ ++ cov[0] += rgbDiff.r * rgbDiff.r; ++ cov[1] += rgbDiff.r * rgbDiff.g; ++ cov[2] += rgbDiff.r * rgbDiff.b; ++ cov[3] += rgbDiff.g * rgbDiff.g; ++ cov[4] += rgbDiff.g * rgbDiff.b; ++ cov[5] += rgbDiff.b * rgbDiff.b; ++ } ++ ++ // convert covariance matrix to float, find principal axis via power iter ++ for( int i = 0; i < 6; ++i ) ++ cov[i] /= 255.0f; ++ ++ float3 vF = maxColour - minColour; ++ ++ const int nIterPower = 4; ++ for( int iter = 0; iter < nIterPower; ++iter ) ++ { ++ const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2]; ++ const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4]; ++ const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5]; ++ ++ vF.r = r; ++ vF.g = g; ++ vF.b = b; ++ } ++ ++ float magn = max3( abs( vF.r ), abs( vF.g ), abs( vF.b ) ); ++ float3 v; ++ ++ if( magn < 4.0f ) ++ { // too small, default to luminance ++ v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000. ++ v.g = 587.0f; ++ v.b = 114.0f; ++ } ++ else ++ { ++ v = trunc( vF * ( 512.0f / magn ) ); ++ } ++ ++ // Pick colors at extreme points ++ float3 minEndpoint, maxEndpoint; ++ float minDot = FLT_MAX; ++ float maxDot = -FLT_MAX; ++ for( int i = 0; i < 16; ++i ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ const float dotValue = dot( currColour, v ); ++ ++ if( dotValue < minDot ) ++ { ++ minDot = dotValue; ++ minEndpoint = currColour; ++ } ++ ++ if( dotValue > maxDot ) ++ { ++ maxDot = dotValue; ++ maxEndpoint = currColour; ++ } ++ } ++ ++ outMinEndp16 = rgb888to565( minEndpoint ); ++ outMaxEndp16 = rgb888to565( maxEndpoint ); ++} ++ ++// The color matching function ++uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] ) ++{ ++ uint mask = 0u; ++ float3 dir = colour[0] - colour[1]; ++ float stops[4]; ++ ++ for( int i = 0; i < 4; ++i ) ++ stops[i] = dot( colour[i], dir ); ++ ++ // think of the colors as arranged on a line; project point onto that line, then choose ++ // next color out of available ones. we compute the crossover points for "best color in top ++ // half"/"best in bottom half" and then the same inside that subinterval. ++ // ++ // relying on this 1d approximation isn't always optimal in terms of euclidean distance, ++ // but it's very close and a lot faster. 
++ // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
++
++ float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
++ float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
++ float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );
++
++#ifndef BC1_DITHER
++ // the version without dithering is straightforward
++ for( uint i = 16u; i-- > 0u; )
++ {
++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
++
++ const float dotValue = dot( currColour, dir );
++ mask <<= 2u;
++
++ if( dotValue < halfPoint )
++ mask |= ( ( dotValue < c0Point ) ? 1u : 3u );
++ else
++ mask |= ( ( dotValue < c3Point ) ? 2u : 0u );
++ }
++#else
++ // with floyd-steinberg dithering
++ float4 ep1 = float4( 0, 0, 0, 0 );
++ float4 ep2 = float4( 0, 0, 0, 0 );
++
++ c0Point *= 16.0f;
++ halfPoint *= 16.0f;
++ c3Point *= 16.0f;
++
++ for( uint y = 0u; y < 4u; ++y )
++ {
++ float ditherDot;
++ uint lmask, step;
++
++ float3 currColour;
++ float dotValue;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[0] = dotValue - stops[step];
++ lmask = step;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[1] = dotValue - stops[step];
++ lmask |= step << 2u;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[2] = dotValue - stops[step];
++ lmask |= step << 4u;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[3] = dotValue - stops[step];
++ lmask |= step << 6u;
++
++ mask |= lmask << ( y * 8u );
++ {
++ float4 tmp = ep1;
++ ep1 = ep2;
++ ep2 = tmp;
++ } // swap
++ }
++#endif
++
++ return mask;
++}
++
++// The refinement function. (Clever code, part 2)
++// Tries to optimize colors to suit block contents better.
++// (By solving a least squares system via normal equations+Cramer's rule)
++bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
++ inout float inOutMaxEndp16 )
++{
++ float newMin16, newMax16;
++ const float oldMin = inOutMinEndp16;
++ const float oldMax = inOutMaxEndp16;
++
++ // A solid-index mask is one 2-bit pattern repeated 16 times, so XORing
++ // it with itself shifted by one index can only leave bits set in the
++ // bottom texel.
++ if( ( mask ^ ( mask << 2u ) ) < 4u ) // all pixels have the same index?
++ { ++ // yes, linear system would be singular; solve using optimal ++ // single-color match on average color ++ float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f ); ++ for( int i = 0; i < 16; ++i ) ++ rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz; ++ ++ rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) ); ++ ++ newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f + // ++ c_oMatch6[uint( rgbVal.g )][0] * 32.0f + // ++ c_oMatch5[uint( rgbVal.b )][0]; ++ newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f + // ++ c_oMatch6[uint( rgbVal.g )][1] * 32.0f + // ++ c_oMatch5[uint( rgbVal.b )][1]; ++ } ++ else ++ { ++ const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f ); ++ const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f ); ++ // ^some magic to save a lot of multiplies in the accumulating loop... ++ // (precomputed products of weights for least squares system, accumulated inside one 32-bit ++ // register) ++ ++ float akku = 0.0f; ++ uint cm = mask; ++ float3 at1 = float3( 0, 0, 0 ); ++ float3 at2 = float3( 0, 0, 0 ); ++ for( int i = 0; i < 16; ++i, cm >>= 2u ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ ++ const uint step = cm & 3u; ++ const float w1 = w1Tab[step]; ++ akku += prods[step]; ++ at1 += currColour * w1; ++ at2 += currColour; ++ } ++ ++ at2 = 3.0f * at2 - at1; ++ ++ // extract solutions and decide solvability ++ const float xx = floor( akku / 65535.0f ); ++ const float yy = floor( mod( akku, 65535.0f ) / 256.0f ); ++ const float xy = mod( akku, 256.0f ); ++ ++ float2 f_rb_g; ++ f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy ); ++ f_rb_g.y = f_rb_g.x * 63.0f / 31.0f; ++ ++ // solve. ++ const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ), ++ float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); ++ newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z; ++ ++ const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ), ++ float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); ++ newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z; ++ } ++ ++ inOutMinEndp16 = newMin16; ++ inOutMaxEndp16 = newMax16; ++ ++ return oldMin != newMin16 || oldMax != newMax16; ++} ++ ++#ifdef BC1_DITHER ++/// Quantizes 'srcValue' which is originally in 888 (full range), ++/// converting it to 565 and then back to 888 (quantized) ++float3 quant( float3 srcValue ) ++{ ++ srcValue = clamp( srcValue, 0.0f, 255.0f ); ++ // Convert 888 -> 565 ++ srcValue = floor( srcValue * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f ); ++ // Convert 565 -> 888 back ++ srcValue = floor( srcValue * float3( 8.25f, 4.0625f, 8.25f ) ); ++ ++ return srcValue; ++} ++ ++void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] ) ++{ ++ float3 ep1[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); ++ float3 ep2[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); ++ ++ for( uint y = 0u; y < 16u; y += 4u ) ++ { ++ float3 srcPixel, dithPixel; ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 0u] ).xyz * 255.0f; ++ dithPixel = quant( srcPixel + trunc( ( 3.0f * ep2[1] + 5.0f * ep2[0] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[0] = srcPixel - dithPixel; ++ dthPixBlck[y + 0u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 1u] ).xyz * 255.0f; ++ dithPixel = quant( ++ srcPixel + trunc( ( 7.0f * ep1[0] + 3.0f * 
ep2[2] + 5.0f * ep2[1] + ep2[0] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[1] = srcPixel - dithPixel; ++ dthPixBlck[y + 1u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 2u] ).xyz * 255.0f; ++ dithPixel = quant( ++ srcPixel + trunc( ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[2] = srcPixel - dithPixel; ++ dthPixBlck[y + 2u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 3u] ).xyz * 255.0f; ++ dithPixel = quant( srcPixel + trunc( ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[3] = srcPixel - dithPixel; ++ dthPixBlck[y + 3u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ // swap( ep1, ep2 ) ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ float3 tmp = ep1[i]; ++ ep1[i] = ep2[i]; ++ ep2[i] = tmp; ++ } ++ } ++} ++#endif ++ ++void main() ++{ ++ uint srcPixelsBlock[16]; ++ ++ bool bAllColoursEqual = true; ++ ++ // Load the whole 4x4 block ++ const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u; ++ for( uint i = 0u; i < 16u; ++i ) ++ { ++ const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u ); ++ const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz; ++ srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) ); ++ bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i]; ++ } ++ ++ float maxEndp16, minEndp16; ++ uint mask = 0u; ++ ++ if( bAllColoursEqual ) ++ { ++ const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f ); ++ mask = 0xAAAAAAAAu; ++ maxEndp16 = ++ c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0]; ++ minEndp16 = ++ c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1]; ++ } ++ else ++ { ++#ifdef BC1_DITHER ++ uint ditherPixelsBlock[16]; ++ // first step: compute dithered version for PCA if desired ++ DitherBlock( srcPixelsBlock, ditherPixelsBlock ); ++#else ++# define ditherPixelsBlock srcPixelsBlock ++#endif ++ ++ // second step: pca+map along principal axis ++ OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 ); ++ if( minEndp16 != maxEndp16 ) ++ { ++ float3 colours[4]; ++ EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted ++ mask = MatchColorsBlock( srcPixelsBlock, colours ); ++ } ++ ++ // third step: refine (multiple times if requested) ++ bool bStopRefinement = false; ++ for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i ) ++ { ++ const uint lastMask = mask; ++ ++ if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) ) ++ { ++ if( minEndp16 != maxEndp16 ) ++ { ++ float3 colours[4]; ++ EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted ++ mask = MatchColorsBlock( srcPixelsBlock, colours ); ++ } ++ else ++ { ++ mask = 0u; ++ bStopRefinement = true; ++ } ++ } ++ ++ bStopRefinement = mask == lastMask || bStopRefinement; ++ } ++ } ++ ++ // write the color block ++ if( maxEndp16 < minEndp16 ) ++ { ++ const float tmpValue = minEndp16; ++ minEndp16 = maxEndp16; ++ maxEndp16 = tmpValue; ++ mask ^= 0x55555555u; ++ } ++ ++ uint4 outputBytes; ++ outputBytes.x = uint( maxEndp16 ); ++ outputBytes.y = uint( minEndp16 ); ++ outputBytes.z = mask & 0xFFFFu; ++ outputBytes.w = mask >> 16u; ++ ++ uint2 dstUV = gl_GlobalInvocationID.xy; ++ imageStore( dstTexture, int2( dstUV ), outputBytes ); ++} +diff --git 
a/src/compiler/glsl/bc4.glsl b/src/compiler/glsl/bc4.glsl +new file mode 100644 +index 00000000000..2486fa4ae0c +--- /dev/null ++++ b/src/compiler/glsl/bc4.glsl +@@ -0,0 +1,187 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#version 310 es ++ ++#if defined(GL_ES) && GL_ES == 1 ++ // Desktop GLSL allows the const keyword for either compile-time or ++ // run-time constants. GLSL ES only allows the keyword for compile-time ++ // constants. Since we use const on run-time constants, define it to ++ // nothing. ++ #define const ++#endif ++ ++#define __sharedOnlyBarrier memoryBarrierShared();barrier(); ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++shared float2 g_minMaxValues[4u * 4u * 4u]; ++shared uint2 g_mask[4u * 4u]; ++ ++layout( location = 0 ) uniform uint2 params; ++ ++#define p_channelIdx params.x ++#define p_useSNorm params.y ++ ++uniform sampler2D srcTex; ++ ++layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++layout( local_size_x = 4, // ++ local_size_y = 4, // ++ local_size_z = 4 ) in; ++ ++/// Each block is 16 pixels ++/// Each thread works on 4 pixels ++/// Therefore each block needs 4 threads, generating 8 masks ++/// At the end these 8 masks get merged into 2 and results written to output ++/// ++/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?** ++/// ++/// A: It's a sweetspot. ++/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound) ++/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks) ++/// overhead, and also more LDS usage which reduces occupancy. ++/// - Long threads (e.g. 1 thread per block) misses parallelism opportunities ++void main() ++{ ++ float minVal, maxVal; ++ float4 srcPixel; ++ ++ const uint blockThreadId = gl_LocalInvocationID.x; ++ ++ const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; ++ ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId ); ++ ++ const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw; ++ srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? 
value.y : value.w ); ++ srcPixel[i] *= 255.0f; ++ } ++ ++ minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z ); ++ maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z ); ++ minVal = min( minVal, srcPixel.w ); ++ maxVal = max( maxVal, srcPixel.w ); ++ ++ const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u ); ++ const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y; ++ ++ g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal ); ++ g_mask[maskIdxBase] = uint2( 0u, 0u ); ++ ++ __sharedOnlyBarrier; ++ ++ // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal ); ++ maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal ); ++ } ++ ++ // determine bias and emit color indices ++ // given the choice of maxVal/minVal, these indices are optimal: ++ // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ ++ float dist = maxVal - minVal; ++ float dist4 = dist * 4.0f; ++ float dist2 = dist * 2.0f; ++ float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f ); ++ bias -= minVal * 7.0f; ++ ++ uint mask0 = 0u, mask1 = 0u; ++ ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ float a = srcPixel[i] * 7.0f + bias; ++ ++ int ind = 0; ++ ++ // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max). ++ if( a >= dist4 ) ++ { ++ ind = 4; ++ a -= dist4; ++ } ++ ++ if( a >= dist2 ) ++ { ++ ind += 2; ++ a -= dist2; ++ } ++ ++ if( a >= dist ) ++ ind += 1; ++ ++ // turn linear scale into DXT index (0/1 are extremal pts) ++ ind = -ind & 7; ++ ind ^= ( 2 > ind ) ? 1 : 0; ++ ++ // write index ++ const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u; ++ if( bits < 32u ) ++ { ++ mask0 |= uint( ind ) << bits; ++ if( bits + 3u > 32u ) ++ { ++ mask1 |= uint( ind ) >> ( 32u - bits ); ++ } ++ } ++ else ++ { ++ mask1 |= uint( ind ) << ( bits - 32u ); ++ } ++ } ++ ++ if( mask0 != 0u ) ++ atomicOr( g_mask[maskIdxBase].x, mask0 ); ++ if( mask1 != 0u ) ++ atomicOr( g_mask[maskIdxBase].y, mask1 ); ++ ++ __sharedOnlyBarrier; ++ ++ if( blockThreadId == 0u ) ++ { ++ // Save data ++ uint4 outputBytes; ++ ++ if( p_useSNorm != 0u ) ++ { ++ outputBytes.x = ++ packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, ++ minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) ); ++ } ++ else ++ { ++ outputBytes.x = packUnorm4x8( ++ float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) ); ++ } ++ outputBytes.y = g_mask[maskIdxBase].x >> 16u; ++ outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu; ++ outputBytes.w = g_mask[maskIdxBase].y >> 16u; ++ ++ uint2 dstUV = gl_GlobalInvocationID.yz; ++ imageStore( dstTexture, int2( dstUV ), outputBytes ); ++ } ++} +diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl +new file mode 100644 +index 00000000000..f90accb82e1 +--- /dev/null ++++ b/src/compiler/glsl/etc2_rgba_stitch.glsl +@@ -0,0 +1,46 @@ ++/* ++ * Copyright 2020-2022 Matias N. 
Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++// RGB and Alpha components of ETC2 RGBA are computed separately. ++// This compute shader merely stitches them together to form the final result ++// It's also used by RG11 driver to stitch two R11 into one RG11 ++ ++#version 310 es ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ ++layout( binding = 0 ) uniform highp usampler2D srcRGB; ++layout( binding = 1 ) uniform highp usampler2D srcAlpha; ++layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; ++ ++void main() ++{ ++ uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; ++ uint2 etcAlpha = OGRE_Load2D( srcAlpha, int2( gl_GlobalInvocationID.xy ), 0 ).xy; ++ ++ imageStore( dstTexture, int2( gl_GlobalInvocationID.xy ), uint4( etcAlpha.xy, etcRgb.xy ) ); ++} +diff --git a/src/compiler/glsl/meson.build b/src/compiler/glsl/meson.build +index abae5adeabe..a130a963b2b 100644 +--- a/src/compiler/glsl/meson.build ++++ b/src/compiler/glsl/meson.build +@@ -70,6 +70,41 @@ float64_glsl_h = custom_target( + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'float64_source'], + ) + ++cross_platform_settings_piece_all_h = custom_target( ++ 'cross_platform_settings_piece_all.h', ++ input : [files_xxd, 'CrossPlatformSettings_piece_all.glsl'], ++ output : 'cross_platform_settings_piece_all.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'cross_platform_settings_piece_all_header'], ++) ++ ++bc1_glsl_h = custom_target( ++ 'bc1_glsl.h', ++ input : [files_xxd, 'bc1.glsl'], ++ output : 'bc1_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc1_source'], ++) ++ ++bc4_glsl_h = custom_target( ++ 'bc4_glsl.h', ++ input : [files_xxd, 'bc4.glsl'], ++ output : 'bc4_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc4_source'], ++) ++ ++etc2_rgba_stitch_glsl_h = custom_target( ++ 'etc2_rgba_stitch_glsl.h', ++ input : [files_xxd, 'etc2_rgba_stitch.glsl'], ++ output : 'etc2_rgba_stitch_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'etc2_rgba_stitch_source'], ++) ++ ++astc_glsl_h = custom_target( ++ 'astc_glsl.h', ++ input : [files_xxd, 'astc_decoder.glsl'], ++ output : 'astc_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'astc_source'], ++) ++ + files_libglsl = files( + 'ast.h', + 
'ast_array_index.cpp', +@@ -233,7 +268,8 @@ libglsl = static_library( + 'glsl', + [files_libglsl, glsl_parser, glsl_lexer_cpp, ir_expression_operation_h, + ir_expression_operation_strings_h, ir_expression_operation_constant_h, +- float64_glsl_h], ++ float64_glsl_h, cross_platform_settings_piece_all_h, bc1_glsl_h, bc4_glsl_h, ++ etc2_rgba_stitch_glsl_h, astc_glsl_h], + c_args : [c_msvc_compat_args, no_override_init_args], + cpp_args : [cpp_msvc_compat_args], + gnu_symbol_visibility : 'hidden', +diff --git a/src/compiler/meson.build b/src/compiler/meson.build +index ce6c6c06c44..e18af3bf335 100644 +--- a/src/compiler/meson.build ++++ b/src/compiler/meson.build +@@ -22,6 +22,8 @@ inc_compiler = include_directories('.') + inc_glsl = include_directories('glsl') + inc_spirv = include_directories('spirv') + ++astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') ++ + files_libcompiler = files( + 'builtin_type_macros.h', + 'glsl_types.cpp', +diff --git a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +index aacf408e608..e8c70fd4ae7 100644 +--- a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h ++++ b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +@@ -39,7 +39,7 @@ DRI_CONF_SECTION_DEBUG + DRI_CONF_FORCE_GL_NAMES_REUSE(false) + DRI_CONF_FORCE_GL_MAP_BUFFER_SYNCHRONIZED(false) + DRI_CONF_TRANSCODE_ETC(false) +- DRI_CONF_TRANSCODE_ASTC(false) ++ DRI_CONF_TRANSCODE_ASTC(true) + DRI_CONF_FORCE_GL_VENDOR() + DRI_CONF_FORCE_GL_RENDERER() + DRI_CONF_OVERRIDE_VRAM_SIZE() +diff --git a/src/util/driconf.h b/src/util/driconf.h +index 7ca309f1ab3..37e617ca413 100644 +--- a/src/util/driconf.h ++++ b/src/util/driconf.h +@@ -571,6 +571,11 @@ + DRI_CONF_OPT_B(radv_require_etc2, def, \ + "Implement emulated ETC2 on HW that does not support it") + ++ ++#define DRI_CONF_RADV_REQUIRE_ASTC(def) \ ++ DRI_CONF_OPT_B(radv_require_astc, def, \ ++ "Implement emulated ASTC on HW that does not support it") ++ + #define DRI_CONF_RADV_DISABLE_HTILE_LAYERS(def) \ + DRI_CONF_OPT_B(radv_disable_htile_layers, def, \ + "Disable HTILE for layered depth/stencil formats") +diff --git a/src/util/meson.build b/src/util/meson.build +index 2a1028f0d3a..03db5cf524b 100644 +--- a/src/util/meson.build ++++ b/src/util/meson.build +@@ -115,6 +115,10 @@ files_mesa_util = files( + 'strndup.h', + 'strtod.c', + 'strtod.h', ++ 'texcompress_astc_luts.cpp', ++ 'texcompress_astc_luts.h', ++ 'texcompress_astc_luts_wrap.cpp', ++ 'texcompress_astc_luts_wrap.h', + 'texcompress_rgtc_tmp.h', + 'timespec.h', + 'u_atomic.c', +diff --git a/src/util/texcompress_astc_luts.cpp b/src/util/texcompress_astc_luts.cpp +new file mode 100644 +index 00000000000..6ffaf23ea88 +--- /dev/null ++++ b/src/util/texcompress_astc_luts.cpp +@@ -0,0 +1,532 @@ ++/* Copyright (c) 2017-2022 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "texcompress_astc_luts.h" ++ ++namespace Granite ++{ ++static void build_astc_unquant_weight_lut(uint8_t *lut, size_t range, const ASTCQuantizationMode &mode) ++{ ++ for (size_t i = 0; i < range; i++) ++ { ++ auto &v = lut[i]; ++ ++ if (!mode.quints && !mode.trits) ++ { ++ switch (mode.bits) ++ { ++ case 1: ++ v = i * 63; ++ break; ++ ++ case 2: ++ v = i * 0x15; ++ break; ++ ++ case 3: ++ v = i * 9; ++ break; ++ ++ case 4: ++ v = (i << 2) | (i >> 2); ++ break; ++ ++ case 5: ++ v = (i << 1) | (i >> 4); ++ break; ++ ++ default: ++ v = 0; ++ break; ++ } ++ } ++ else if (mode.bits == 0) ++ { ++ if (mode.trits) ++ v = 32 * i; ++ else ++ v = 16 * i; ++ } ++ else ++ { ++ unsigned b = (i >> 1) & 1; ++ unsigned c = (i >> 2) & 1; ++ unsigned A, B, C, D; ++ ++ A = 0x7f * (i & 1); ++ D = i >> mode.bits; ++ B = 0; ++ ++ if (mode.trits) ++ { ++ static const unsigned Cs[3] = { 50, 23, 11 }; ++ C = Cs[mode.bits - 1]; ++ if (mode.bits == 2) ++ B = 0x45 * b; ++ else if (mode.bits == 3) ++ B = 0x21 * b + 0x42 * c; ++ } ++ else ++ { ++ static const unsigned Cs[2] = { 28, 13 }; ++ C = Cs[mode.bits - 1]; ++ if (mode.bits == 2) ++ B = 0x42 * b; ++ } ++ ++ unsigned unq = D * C + B; ++ unq ^= A; ++ unq = (A & 0x20) | (unq >> 2); ++ v = unq; ++ } ++ ++ // Expand [0, 63] to [0, 64]. ++ if (mode.bits != 0 && v > 32) ++ v++; ++ } ++} ++ ++static void build_astc_unquant_endpoint_lut(uint8_t *lut, size_t range, const ASTCQuantizationMode &mode) ++{ ++ for (size_t i = 0; i < range; i++) ++ { ++ auto &v = lut[i]; ++ ++ if (!mode.quints && !mode.trits) ++ { ++ // Bit-replication. 
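++ // A plain b-bit endpoint expands to 8 bits by replicating its top
++ // bits into the vacated low bits, which maps 0 to 0x00 and the
++ // largest code to 0xff exactly. E.g. with 5 bits, (i << 3) | (i >> 2)
++ // turns 0x1f into 0xff and 0x10 into 0x84.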
++ switch (mode.bits) ++ { ++ case 1: ++ v = i * 0xff; ++ break; ++ ++ case 2: ++ v = i * 0x55; ++ break; ++ ++ case 3: ++ v = (i << 5) | (i << 2) | (i >> 1); ++ break; ++ ++ case 4: ++ v = i * 0x11; ++ break; ++ ++ case 5: ++ v = (i << 3) | (i >> 2); ++ break; ++ ++ case 6: ++ v = (i << 2) | (i >> 4); ++ break; ++ ++ case 7: ++ v = (i << 1) | (i >> 6); ++ break; ++ ++ default: ++ v = i; ++ break; ++ } ++ } ++ else ++ { ++ unsigned A, B, C, D; ++ unsigned b = (i >> 1) & 1; ++ unsigned c = (i >> 2) & 1; ++ unsigned d = (i >> 3) & 1; ++ unsigned e = (i >> 4) & 1; ++ unsigned f = (i >> 5) & 1; ++ ++ B = 0; ++ D = i >> mode.bits; ++ A = (i & 1) * 0x1ff; ++ ++ if (mode.trits) ++ { ++ static const unsigned Cs[6] = { 204, 93, 44, 22, 11, 5 }; ++ C = Cs[mode.bits - 1]; ++ ++ switch (mode.bits) ++ { ++ case 2: ++ B = b * 0x116; ++ break; ++ ++ case 3: ++ B = b * 0x85 + c * 0x10a; ++ break; ++ ++ case 4: ++ B = b * 0x41 + c * 0x82 + d * 0x104; ++ break; ++ ++ case 5: ++ B = b * 0x20 + c * 0x40 + d * 0x81 + e * 0x102; ++ break; ++ ++ case 6: ++ B = b * 0x10 + c * 0x20 + d * 0x40 + e * 0x80 + f * 0x101; ++ break; ++ } ++ } ++ else ++ { ++ static const unsigned Cs[5] = { 113, 54, 26, 13, 6 }; ++ C = Cs[mode.bits - 1]; ++ ++ switch (mode.bits) ++ { ++ case 2: ++ B = b * 0x10c; ++ break; ++ ++ case 3: ++ B = b * 0x82 + c * 0x105; ++ break; ++ ++ case 4: ++ B = b * 0x40 + c * 0x81 + d * 0x102; ++ break; ++ ++ case 5: ++ B = b * 0x20 + c * 0x40 + d * 0x80 + e * 0x101; ++ break; ++ } ++ } ++ ++ unsigned unq = D * C + B; ++ unq ^= A; ++ unq = (A & 0x80) | (unq >> 2); ++ v = uint8_t(unq); ++ } ++ } ++} ++ ++static unsigned astc_value_range(const ASTCQuantizationMode &mode) ++{ ++ unsigned value_range = 1u << mode.bits; ++ if (mode.trits) ++ value_range *= 3; ++ if (mode.quints) ++ value_range *= 5; ++ ++ if (value_range == 1) ++ value_range = 0; ++ return value_range; ++} ++ ++static uint32_t astc_hash52(uint32_t p) ++{ ++ p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; ++ p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; ++ p ^= p << 6; p ^= p >> 17; ++ return p; ++} ++ ++// Copy-paste from spec. ++static int astc_select_partition(int seed, int x, int y, int z, int partitioncount, bool small_block) ++{ ++ if (small_block) ++ { ++ x <<= 1; ++ y <<= 1; ++ z <<= 1; ++ } ++ ++ seed += (partitioncount - 1) * 1024; ++ uint32_t rnum = astc_hash52(seed); ++ uint8_t seed1 = rnum & 0xF; ++ uint8_t seed2 = (rnum >> 4) & 0xF; ++ uint8_t seed3 = (rnum >> 8) & 0xF; ++ uint8_t seed4 = (rnum >> 12) & 0xF; ++ uint8_t seed5 = (rnum >> 16) & 0xF; ++ uint8_t seed6 = (rnum >> 20) & 0xF; ++ uint8_t seed7 = (rnum >> 24) & 0xF; ++ uint8_t seed8 = (rnum >> 28) & 0xF; ++ uint8_t seed9 = (rnum >> 18) & 0xF; ++ uint8_t seed10 = (rnum >> 22) & 0xF; ++ uint8_t seed11 = (rnum >> 26) & 0xF; ++ uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; ++ ++ seed1 *= seed1; seed2 *= seed2; seed3 *= seed3; seed4 *= seed4; ++ seed5 *= seed5; seed6 *= seed6; seed7 *= seed7; seed8 *= seed8; ++ seed9 *= seed9; seed10 *= seed10; seed11 *= seed11; seed12 *= seed12; ++ ++ int sh1, sh2, sh3; ++ if (seed & 1) ++ { ++ sh1 = seed & 2 ? 4 : 5; ++ sh2 = partitioncount == 3 ? 6 : 5; ++ } ++ else ++ { ++ sh1 = partitioncount == 3 ? 6 : 5; ++ sh2 = seed & 2 ? 4 : 5; ++ } ++ sh3 = (seed & 0x10) ? 
sh1 : sh2;
++
++ seed1 >>= sh1; seed2 >>= sh2; seed3 >>= sh1; seed4 >>= sh2;
++ seed5 >>= sh1; seed6 >>= sh2; seed7 >>= sh1; seed8 >>= sh2;
++ seed9 >>= sh3; seed10 >>= sh3; seed11 >>= sh3; seed12 >>= sh3;
++
++ int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
++ int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
++ int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
++ int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
++
++ a &= 0x3f; b &= 0x3f; c &= 0x3f; d &= 0x3f;
++
++ if (partitioncount < 4)
++ d = 0;
++ if (partitioncount < 3)
++ c = 0;
++
++ if (a >= b && a >= c && a >= d)
++ return 0;
++ else if (b >= c && b >= d)
++ return 1;
++ else if (c >= d)
++ return 2;
++ else
++ return 3;
++}
++
++ASTCLutHolder::PartitionTable::PartitionTable(unsigned block_width, unsigned block_height)
++{
++ bool small_block = (block_width * block_height) < 31;
++
++ lut_width = block_width * 32;
++ lut_height = block_height * 32;
++ lut_buffer.resize(lut_width * lut_height);
++
++ for (unsigned seed_y = 0; seed_y < 32; seed_y++)
++ {
++ for (unsigned seed_x = 0; seed_x < 32; seed_x++)
++ {
++ unsigned seed = seed_y * 32 + seed_x;
++ for (unsigned block_y = 0; block_y < block_height; block_y++)
++ {
++ for (unsigned block_x = 0; block_x < block_width; block_x++)
++ {
++ int part2 = astc_select_partition(seed, block_x, block_y, 0, 2, small_block);
++ int part3 = astc_select_partition(seed, block_x, block_y, 0, 3, small_block);
++ int part4 = astc_select_partition(seed, block_x, block_y, 0, 4, small_block);
++ lut_buffer[(seed_y * block_height + block_y) * lut_width + (seed_x * block_width + block_x)] =
++ (part2 << 0) | (part3 << 2) | (part4 << 4);
++ }
++ }
++ }
++ }
++}
++
++ASTCLutHolder::PartitionTable &ASTCLutHolder::get_partition_table(unsigned width, unsigned height)
++{
++ std::lock_guard<std::mutex> holder{table_lock};
++ auto itr = tables.find(width * 16 + height);
++ if (itr != tables.end())
++ {
++ return itr->second;
++ }
++ else
++ {
++ auto &t = tables[width * 16 + height];
++ t = { width, height };
++ return t;
++ }
++}
++
++ASTCLutHolder &get_astc_luts()
++{
++ static ASTCLutHolder holder;
++ return holder;
++}
++
++ASTCLutHolder::ASTCLutHolder()
++{
++ init_color_endpoint();
++ init_weight_luts();
++ init_trits_quints();
++}
++
++void ASTCLutHolder::init_color_endpoint()
++{
++ auto &unquant_lut = color_endpoint.unquant_lut;
++
++ for (size_t i = 0; i < astc_num_quantization_modes; i++)
++ {
++ auto value_range = astc_value_range(astc_quantization_modes[i]);
++ color_endpoint.unquant_lut_offsets[i] = color_endpoint.unquant_offset;
++ build_astc_unquant_endpoint_lut(unquant_lut + color_endpoint.unquant_offset, value_range, astc_quantization_modes[i]);
++ color_endpoint.unquant_offset += value_range;
++ }
++
++ auto &lut = color_endpoint.lut;
++
++ // We can have a maximum of 9 endpoint pairs, i.e. 18 endpoint values in total.
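++ // A quantization mode fits when its ISE bit cost is within the remaining
++ // bits: num_values plain values cost bits * num_values, trits add
++ // ceil(8 * num_values / 5) bits and quints add ceil(7 * num_values / 3)
++ // bits, which is what the total_bits expression below computes with
++ // integer rounding.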
++ for (unsigned pairs_minus_1 = 0; pairs_minus_1 < 9; pairs_minus_1++) ++ { ++ for (unsigned remaining = 0; remaining < 128; remaining++) ++ { ++ bool found_mode = false; ++ for (auto &mode : astc_quantization_modes) ++ { ++ unsigned num_values = (pairs_minus_1 + 1) * 2; ++ unsigned total_bits = mode.bits * num_values + ++ (mode.quints * 7 * num_values + 2) / 3 + ++ (mode.trits * 8 * num_values + 4) / 5; ++ ++ if (total_bits <= remaining) ++ { ++ found_mode = true; ++ lut[pairs_minus_1][remaining][0] = mode.bits; ++ lut[pairs_minus_1][remaining][1] = mode.trits; ++ lut[pairs_minus_1][remaining][2] = mode.quints; ++ lut[pairs_minus_1][remaining][3] = color_endpoint.unquant_lut_offsets[&mode - astc_quantization_modes]; ++ break; ++ } ++ } + -+ radv_meta_restore(&saved_state, cmd_buffer); ++ if (!found_mode) ++ memset(lut[pairs_minus_1][remaining], 0, sizeof(lut[pairs_minus_1][remaining])); ++ } ++ } +} -diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h -index 472858dd401..04a670d714d 100644 ---- a/src/amd/vulkan/radv_private.h -+++ b/src/amd/vulkan/radv_private.h -@@ -98,6 +98,7 @@ typedef uint32_t xcb_window_t; - - #include "vk_texcompress_astc.h" - #include "wsi_common.h" -+#include "vk_texcompress_bcn.h" - - #ifdef __cplusplus - extern "C" -@@ -681,6 +682,8 @@ struct radv_meta_state { - } etc_decode; - - struct vk_texcompress_astc_state *astc_decode; + -+ struct vk_texcompress_bcn_state *bcn_encoded[VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES]; - }; - - #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1) -@@ -2090,8 +2093,12 @@ struct radv_image { - bool dcc_sign_reinterpret; - bool support_comp_to_single; - -- bool isNeedSoftEncode;// 是否需要使用 软编 -- bool isNeedSoftDecode;// 是否需要使用 软解 -+ bool isNeedSoftEncode; // 是否需要使用 软编 -+ bool isNeedSoftDecode; // 是否需要使用 软解 ++void ASTCLutHolder::init_weight_luts() ++{ ++ auto &lut = weights.lut; ++ auto &unquant_lut = weights.unquant_lut; ++ auto &unquant_offset = weights.unquant_offset; ++ ++ for (size_t i = 0; i < astc_num_weight_modes; i++) ++ { ++ auto value_range = astc_value_range(astc_weight_modes[i]); ++ lut[i][0] = astc_weight_modes[i].bits; ++ lut[i][1] = astc_weight_modes[i].trits; ++ lut[i][2] = astc_weight_modes[i].quints; ++ lut[i][3] = unquant_offset; ++ build_astc_unquant_weight_lut(unquant_lut + unquant_offset, value_range, astc_weight_modes[i]); ++ unquant_offset += value_range; ++ } + -+ bool isNeedHardEncode; // 是否需要使用 硬编 -+ struct vk_texcompress_bcn_image *staging_images; // 硬编的中间资源 ++ assert(unquant_offset <= 256); ++} + - VkFormat srcFormat; //用于保存纹理的原始格式 - VkFormat unpackETCFormat;//用于保存ETC纹理解压后的格式 - -@@ -2474,6 +2481,8 @@ void radv_image_view_init(struct radv_image_view *view, struct radv_device *devi - const struct radv_image_view_extra_create_info *extra_create_info); - void radv_image_view_finish(struct radv_image_view *iview); - -+void radv_internal_image_finish(struct radv_image *image); ++void ASTCLutHolder::init_trits_quints() ++{ ++ // From specification. ++ auto &trits_quints = integer.trits_quints; + - VkFormat radv_get_aspect_format(struct radv_image *image, VkImageAspectFlags mask); - - struct radv_sampler_ycbcr_conversion_state { -diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl -index 251635d7b69..627b2e5419c 100644 ---- a/src/compiler/glsl/bc1.glsl -+++ b/src/compiler/glsl/bc1.glsl -@@ -1,3 +1,4 @@ -+#version 310 es - /* - * Copyright 2020-2022 Matias N. Goldberg - * Copyright 2022 Intel Corporation -@@ -21,7 +22,6 @@ - * DEALINGS IN THE SOFTWARE. 
- */ - --#version 310 es - - #if defined(GL_ES) && GL_ES == 1 - // Desktop GLSL allows the const keyword for either compile-time or -@@ -31,10 +31,31 @@ - #define const - #endif - --%s // include "CrossPlatformSettings_piece_all.glsl" -- - #define FLT_MAX 340282346638528859811704183484516925440.0f - -+#ifdef VULKAN ++ for (unsigned T = 0; T < 256; T++) ++ { ++ unsigned C; ++ uint8_t t0, t1, t2, t3, t4; + -+#extension GL_GOOGLE_include_directive : require -+#include "CrossPlatformSettings_piece_all.glsl" ++ if (((T >> 2) & 7) == 7) ++ { ++ C = (((T >> 5) & 7) << 2) | (T & 3); ++ t4 = t3 = 2; ++ } ++ else ++ { ++ C = T & 0x1f; ++ if (((T >> 5) & 3) == 3) ++ { ++ t4 = 2; ++ t3 = (T >> 7) & 1; ++ } ++ else ++ { ++ t4 = (T >> 7) & 1; ++ t3 = (T >> 5) & 3; ++ } ++ } + -+layout(push_constant, std430) uniform pc { -+ uint p_numRefinements; -+}; ++ if ((C & 3) == 3) ++ { ++ t2 = 2; ++ t1 = (C >> 4) & 1; ++ t0 = (((C >> 3) & 1) << 1) | (((C >> 2) & 1) & ~(((C >> 3) & 1))); ++ } ++ else if (((C >> 2) & 3) == 3) ++ { ++ t2 = 2; ++ t1 = 2; ++ t0 = C & 3; ++ } ++ else ++ { ++ t2 = (C >> 4) & 1; ++ t1 = (C >> 2) & 3; ++ t0 = (((C >> 1) & 1) << 1) | ((C & 1) & ~(((C >> 1) & 1))); ++ } + -+layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ trits_quints[T] = t0 | (t1 << 3) | (t2 << 6) | (t3 << 9) | (t4 << 12); ++ } + -+layout (std430, set = 0, binding = 1) readonly restrict buffer globalBuffer -+{ -+ float2 c_oMatch5[256]; -+ float2 c_oMatch6[256]; -+}; ++ for (unsigned Q = 0; Q < 128; Q++) ++ { ++ unsigned C; ++ uint8_t q0, q1, q2; ++ if (((Q >> 1) & 3) == 3 && ((Q >> 5) & 3) == 0) ++ { ++ q2 = ((Q & 1) << 2) | ((((Q >> 4) & 1) & ~(Q & 1)) << 1) | (((Q >> 3) & 1) & ~(Q & 1)); ++ q1 = q0 = 4; ++ } ++ else ++ { ++ if (((Q >> 1) & 3) == 3) ++ { ++ q2 = 4; ++ C = (((Q >> 3) & 3) << 3) | ((~(Q >> 5) & 3) << 1) | (Q & 1); ++ } ++ else ++ { ++ q2 = (Q >> 5) & 3; ++ C = Q & 0x1f; ++ } + -+layout (rgba16ui, set = 0, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ if ((C & 7) == 5) ++ { ++ q1 = 4; ++ q0 = (C >> 3) & 3; ++ } ++ else ++ { ++ q1 = (C >> 3) & 3; ++ q0 = C & 7; ++ } ++ } + -+#else ++ trits_quints[256 + Q] = q0 | (q1 << 3) | (q2 << 6); ++ } ++} ++} +diff --git a/src/util/texcompress_astc_luts.h b/src/util/texcompress_astc_luts.h +new file mode 100644 +index 00000000000..b3c26033a9a +--- /dev/null ++++ b/src/util/texcompress_astc_luts.h +@@ -0,0 +1,127 @@ ++/* Copyright (c) 2017-2022 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#pragma once
++
++#include <stdint.h>
++#include <stddef.h>
++#include <vector>
++#include <mutex>
++#include <unordered_map>
++
++namespace Granite
++{
++
++struct ASTCQuantizationMode
++{
++ uint8_t bits, trits, quints;
++};
++
++// In order to decode color endpoints, we need to convert available bits and number of values
++// into a format of (bits, trits, quints). A simple LUT texture is a reasonable approach for this.
++// Decoders are expected to have some form of LUT to deal with this ...
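++//
++// Each mode below covers (1 << bits) * 3^trits * 5^quints distinct values;
++// for example { 6, 1, 0 } encodes 64 * 3 = 192 values and { 5, 0, 1 }
++// encodes 32 * 5 = 160 (see astc_value_range() in texcompress_astc_luts.cpp).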
++static const ASTCQuantizationMode astc_quantization_modes[] = { ++ { 8, 0, 0 }, ++ { 6, 1, 0 }, ++ { 5, 0, 1 }, ++ { 7, 0, 0 }, ++ { 5, 1, 0 }, ++ { 4, 0, 1 }, ++ { 6, 0, 0 }, ++ { 4, 1, 0 }, ++ { 3, 0, 1 }, ++ { 5, 0, 0 }, ++ { 3, 1, 0 }, ++ { 2, 0, 1 }, ++ { 4, 0, 0 }, ++ { 2, 1, 0 }, ++ { 1, 0, 1 }, ++ { 3, 0, 0 }, ++ { 1, 1, 0 }, ++}; + -+shared float2 g_minMaxValues[4u * 4u * 4u]; -+shared uint2 g_mask[4u * 4u]; ++constexpr size_t astc_num_quantization_modes = sizeof(astc_quantization_modes) / sizeof(astc_quantization_modes[0]); ++ ++static const ASTCQuantizationMode astc_weight_modes[] = { ++ { 0, 0, 0 }, // Invalid ++ { 0, 0, 0 }, // Invalid ++ { 1, 0, 0 }, ++ { 0, 1, 0 }, ++ { 2, 0, 0 }, ++ { 0, 0, 1 }, ++ { 1, 1, 0 }, ++ { 3, 0, 0 }, ++ { 0, 0, 0 }, // Invalid ++ { 0, 0, 0 }, // Invalid ++ { 1, 0, 1 }, ++ { 2, 1, 0 }, ++ { 4, 0, 0 }, ++ { 2, 0, 1 }, ++ { 3, 1, 0 }, ++ { 5, 0, 0 }, ++}; + -+#define p_channelIdx params.x -+#define p_useSNorm params.y ++constexpr size_t astc_num_weight_modes = sizeof(astc_weight_modes) / sizeof(astc_weight_modes[0]); + - layout( local_size_x = 4, // - local_size_y = 4, // - local_size_z = 4 ) in; -diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl -index f90accb82e1..8d50c469956 100644 ---- a/src/compiler/glsl/etc2_rgba_stitch.glsl -+++ b/src/compiler/glsl/etc2_rgba_stitch.glsl -@@ -1,3 +1,5 @@ -+#version 310 es ++struct ASTCLutHolder ++{ ++ ASTCLutHolder(); ++ ++ void init_color_endpoint(); ++ void init_weight_luts(); ++ void init_trits_quints(); ++ ++ struct ++ { ++ size_t unquant_offset = 0; ++ uint8_t unquant_lut[2048]; ++ uint16_t lut[9][128][4]; ++ size_t unquant_lut_offsets[astc_num_quantization_modes]; ++ } color_endpoint; ++ ++ struct ++ { ++ size_t unquant_offset = 0; ++ uint8_t unquant_lut[2048]; ++ uint8_t lut[astc_num_weight_modes][4]; ++ } weights; ++ ++ struct ++ { ++ uint16_t trits_quints[256 + 128]; ++ } integer; ++ ++ struct PartitionTable ++ { ++ PartitionTable() = default; ++ PartitionTable(unsigned width, unsigned height); ++ std::vector lut_buffer; ++ unsigned lut_width = 0; ++ unsigned lut_height = 0; ++ }; ++ ++ std::mutex table_lock; ++ std::unordered_map tables; ++ ++ PartitionTable &get_partition_table(unsigned width, unsigned height); ++}; + - /* - * Copyright 2020-2022 Matias N. 
Goldberg - * Copyright 2022 Intel Corporation -@@ -25,18 +27,30 @@ - // This compute shader merely stitches them together to form the final result - // It's also used by RG11 driver to stitch two R11 into one RG11 - --#version 310 es -+#ifdef VULKAN - --%s // include "CrossPlatformSettings_piece_all.glsl" -+#extension GL_GOOGLE_include_directive : require -+#include "CrossPlatformSettings_piece_all.glsl" - --layout( local_size_x = 8, // -- local_size_y = 8, // -- local_size_z = 1 ) in; -+layout ( binding = 0 ) uniform highp usampler2D srcRGB; -+layout ( binding = 1 ) uniform highp usampler2D srcAlpha; ++ASTCLutHolder &get_astc_luts(); ++} +diff --git a/src/util/texcompress_astc_luts_wrap.cpp b/src/util/texcompress_astc_luts_wrap.cpp +new file mode 100644 +index 00000000000..6020a34e2c9 +--- /dev/null ++++ b/src/util/texcompress_astc_luts_wrap.cpp +@@ -0,0 +1,64 @@ ++/* ++ * Copyright © 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included ++ * in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */ + -+layout ( rgba32ui, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++#include "texcompress_astc_luts_wrap.h" ++#include "texcompress_astc_luts.h" + -+#else ++extern "C" void ++_mesa_init_astc_decoder_luts(astc_decoder_lut_holder *holder) ++{ ++ auto &luts = Granite::get_astc_luts(); + -+%s // include "CrossPlatformSettings_piece_all.glsl" - - layout( binding = 0 ) uniform highp usampler2D srcRGB; - layout( binding = 1 ) uniform highp usampler2D srcAlpha; - layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; - -+#endif ++ holder->color_endpoint.data = luts.color_endpoint.lut; ++ holder->color_endpoint.size_B = sizeof(luts.color_endpoint.lut); ++ holder->color_endpoint.format = PIPE_FORMAT_R16G16B16A16_UINT; + -+layout( local_size_x = 8, // -+ local_size_y = 8, // -+ local_size_z = 1 ) in; ++ holder->color_endpoint_unquant.data = luts.color_endpoint.unquant_lut; ++ holder->color_endpoint_unquant.size_B = luts.color_endpoint.unquant_offset; ++ holder->color_endpoint_unquant.format = PIPE_FORMAT_R8_UINT; + - void main() - { - uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; -diff --git a/src/compiler/meson.build b/src/compiler/meson.build -index e18af3bf335..85b19b84bec 100644 ---- a/src/compiler/meson.build -+++ b/src/compiler/meson.build -@@ -24,6 +24,14 @@ inc_spirv = include_directories('spirv') - - astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') - -+bc3_encode_head_dir = meson.current_source_dir() / 'glsl' ++ holder->weights.data = luts.weights.lut; ++ holder->weights.size_B = sizeof(luts.weights.lut); ++ holder->weights.format = PIPE_FORMAT_R8G8B8A8_UINT; + -+bc1_glsl_file = files('glsl/bc1.glsl') ++ holder->weights_unquant.data = luts.weights.unquant_lut; ++ holder->weights_unquant.size_B = luts.weights.unquant_offset; ++ holder->weights_unquant.format = PIPE_FORMAT_R8_UINT; + -+bc4_glsl_file = files('glsl/bc4.glsl') ++ holder->trits_quints.data = luts.integer.trits_quints; ++ holder->trits_quints.size_B = sizeof(luts.integer.trits_quints); ++ holder->trits_quints.format = PIPE_FORMAT_R16_UINT; ++} + -+etc2_rgba_stitch_glsl_file = files('glsl/etc2_rgba_stitch.glsl') ++extern "C" void * ++_mesa_get_astc_decoder_partition_table(uint32_t block_width, ++ uint32_t block_height, ++ unsigned *lut_width, ++ unsigned *lut_height) ++{ ++ auto &luts = Granite::get_astc_luts(); ++ auto &table = luts.get_partition_table(block_width, block_height); + - files_libcompiler = files( - 'builtin_type_macros.h', - 'glsl_types.cpp', -diff --git a/src/util/bc1_tables.h b/src/util/bc1_tables.h ++ *lut_width = table.lut_width; ++ *lut_height = table.lut_height; ++ return table.lut_buffer.data(); ++} +diff --git a/src/util/texcompress_astc_luts_wrap.h b/src/util/texcompress_astc_luts_wrap.h new file mode 100644 -index 00000000000..654ccdadb78 +index 00000000000..2d4a6eb6d85 --- /dev/null -+++ b/src/util/bc1_tables.h -@@ -0,0 +1,124 @@ ++++ b/src/util/texcompress_astc_luts_wrap.h +@@ -0,0 +1,62 @@ +/* -+ * Copyright 2020-2022 Matias N. 
Goldberg ++ * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), @@ -998,243 +4976,85 @@ index 00000000000..654ccdadb78 + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. ++ * The above copyright notice and this permission notice shall be included ++ * in all copies or substantial portions of the Software. + * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + -+/* These tables have been generated with: ++#ifndef TEXCOMPRESS_ASTC_LUTS_WRAP_H ++#define TEXCOMPRESS_ASTC_LUTS_WRAP_H + -+ #define STB_DXT_IMPLEMENTATION -+ #include "stb_dxt.h" -+ #include -+ -+ int main() -+ { -+ stb__InitDXT(); ++#include ++#include ++#include "format/u_format.h" + -+ printf( "static unsigned char stb__OMatch5[256][2] = {\n" ); -+ for( size_t i = 0u; i < 256u; ++i ) -+ { -+ printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] ); -+ if( i != 255u ) -+ printf( ",\n" ); -+ } -+ printf( "\n};\n" ); ++/* C wrapper for Granite::ASTCLutHolder. 
*/ + -+ printf( "static unsigned char stb__OMatch6[256][2] = {\n" ); -+ for( size_t i = 0u; i < 256u; ++i ) -+ { -+ printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] ); -+ if( i != 255u ) -+ printf( ",\n" ); -+ } -+ printf( "\n};\n" ); ++#ifdef __cplusplus ++extern "C" { ++#endif + -+ return 0; -+ } ++typedef struct ++{ ++ void *data; ++ size_t size_B; ++ enum pipe_format format; ++} astc_decoder_lut; + -+ Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min -+*/ -+static unsigned char stb__OMatch5[256][2] = { -+ { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, -+ { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 }, -+ { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 }, -+ { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 }, -+ { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 }, -+ { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 }, -+ { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 }, -+ { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 }, -+ { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 }, -+ { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 }, -+ { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 }, -+ { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 }, -+ { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 }, -+ { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 }, -+ { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 }, -+ { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 }, -+ { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 }, -+ { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 }, -+ { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 }, -+ { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 }, -+ { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 }, -+ { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 }, -+ { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 }, -+ { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 }, -+ { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 }, -+ { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 }, -+ { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 }, -+ { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 }, -+ { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 }, -+ { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 }, -+ { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 }, -+ { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 } -+}; 
++typedef struct ++{ ++ astc_decoder_lut color_endpoint; ++ astc_decoder_lut color_endpoint_unquant; ++ astc_decoder_lut weights; ++ astc_decoder_lut weights_unquant; ++ astc_decoder_lut trits_quints; ++} astc_decoder_lut_holder; ++ ++void _mesa_init_astc_decoder_luts(astc_decoder_lut_holder *holder); ++void *_mesa_get_astc_decoder_partition_table(uint32_t block_width, ++ uint32_t block_height, ++ unsigned *lut_width, ++ unsigned *lut_height); ++ ++#ifdef __cplusplus ++} ++#endif + -+static unsigned char stb__OMatch6[256][2] = { -+ { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 }, -+ { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 }, -+ { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 }, -+ { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 }, -+ { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 }, -+ { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 }, -+ { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 }, -+ { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 }, -+ { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 }, -+ { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 }, -+ { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 }, -+ { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 }, -+ { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 }, -+ { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 }, -+ { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 }, -+ { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 }, -+ { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 }, -+ { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 }, -+ { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 }, -+ { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 }, -+ { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 }, -+ { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 }, -+ { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 }, -+ { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 }, -+ { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 }, -+ { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 }, -+ { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 }, -+ { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 }, -+ { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 }, -+ { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 }, -+ { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 }, -+ { 63, 58 }, { 63, 59 }, { 63, 
59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 } -+}; ++#endif diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build -index 1c02e1a5dd8..54c89d793a0 100644 +index af134efb04e..1c02e1a5dd8 100644 --- a/src/vulkan/runtime/meson.build +++ b/src/vulkan/runtime/meson.build -@@ -93,12 +93,31 @@ endif - - if prog_glslang.found() - vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h') -+ vulkan_runtime_files += files('vk_texcompress_bcn.c', 'vk_texcompress_bcn.h') - vulkan_runtime_files += custom_target( - 'astc_spv.h', - input : astc_decoder_glsl_file, - output : 'astc_spv.h', - command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, - ) -+ vulkan_runtime_files += custom_target( -+ 'bc1_spv.h', -+ input : bc1_glsl_file, -+ output : 'bc1_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) -+ vulkan_runtime_files += custom_target( -+ 'bc4_spv.h', -+ input : bc4_glsl_file, -+ output : 'bc4_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) -+ vulkan_runtime_files += custom_target( -+ 'bc3_spv.h', -+ input : etc2_rgba_stitch_glsl_file, -+ output : 'bc3_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) +@@ -91,6 +91,16 @@ if with_platform_android + vulkan_runtime_deps += dep_android endif - vk_common_entrypoints = custom_target( -diff --git a/src/vulkan/runtime/vk_command_buffer.h b/src/vulkan/runtime/vk_command_buffer.h -index 36099d03a56..1a04b53510f 100644 ---- a/src/vulkan/runtime/vk_command_buffer.h -+++ b/src/vulkan/runtime/vk_command_buffer.h -@@ -28,6 +28,7 @@ - #include "vk_object.h" - #include "util/list.h" - #include "util/u_dynarray.h" -+#include "vk_texcompress_bcn.h" - - #ifdef __cplusplus - extern "C" { -@@ -130,6 +131,12 @@ struct vk_command_buffer { - /* This uses the same trick as STACK_ARRAY */ - struct vk_attachment_state *attachments; - struct vk_attachment_state _attachments[8]; -+ -+ bool is_need_hard_encode; -+ -+ struct vk_texcompress_staging_images* staging_image_list_head; -+ -+ struct vk_texcompress_dummy_image* dummy; - }; - - VK_DEFINE_HANDLE_CASTS(vk_command_buffer, base, VkCommandBuffer, -diff --git a/src/vulkan/runtime/vk_queue.c b/src/vulkan/runtime/vk_queue.c -index 6216af9d856..15994e3420d 100644 ---- a/src/vulkan/runtime/vk_queue.c -+++ b/src/vulkan/runtime/vk_queue.c -@@ -42,6 +42,7 @@ - #include "vk_util.h" - - #include "vulkan/wsi/wsi_common.h" -+#include "vk_texcompress_bcn.h" - - static VkResult - vk_queue_start_submit_thread(struct vk_queue *queue); -@@ -1140,7 +1141,22 @@ vk_common_QueueSubmit2KHR(VkQueue _queue, - } - } - -+ int n_command_buffers = 0; -+ for (uint32_t s = 0; s < submitCount; s++) { -+ n_command_buffers += pSubmits[s].commandBufferInfoCount; -+ } -+ const uint32_t const_num_cmd_buf = n_command_buffers; -+ uint32_t num_cmd_buf = 0; -+ struct vk_command_buffer *need_delete_cmd_buf[const_num_cmd_buf]; -+ - for (uint32_t i = 0; i < submitCount; i++) { -+ for (uint32_t j = 0; j < pSubmits[i].commandBufferInfoCount; j++) { -+ VK_FROM_HANDLE(vk_command_buffer, commandBuffer, (pSubmits[i].pCommandBufferInfos)[j].commandBuffer); -+ if (commandBuffer->is_need_hard_encode) { -+ need_delete_cmd_buf[num_cmd_buf] = commandBuffer; -+ num_cmd_buf += 1; -+ } -+ } - struct vulkan_submit_info info = 
{ - .pNext = pSubmits[i].pNext, - .command_buffer_count = pSubmits[i].commandBufferInfoCount, -@@ -1156,6 +1172,20 @@ vk_common_QueueSubmit2KHR(VkQueue _queue, - return result; - } - -+ if (num_cmd_buf > 0 && fence == NULL) { -+ vk_texcompress_create_fence(queue->base.device, NULL, &_fence); -+ } -+ -+ if (num_cmd_buf > 0) { -+ vk_texcompress_wait_fence(queue->base.device, NULL, &_fence); -+ for (uint32_t i = 0; i < num_cmd_buf; i++) { -+ while (need_delete_cmd_buf[i]->staging_image_list_head != NULL) { -+ vk_texcompress_delete_head(&need_delete_cmd_buf[i]->staging_image_list_head, queue->base.device); -+ } -+ need_delete_cmd_buf[i]->is_need_hard_encode = false; -+ } -+ } ++if prog_glslang.found() ++ vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h') ++ vulkan_runtime_files += custom_target( ++ 'astc_spv.h', ++ input : astc_decoder_glsl_file, ++ output : 'astc_spv.h', ++ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, ++ ) ++endif + - return VK_SUCCESS; - } - -diff --git a/src/vulkan/runtime/vk_texcompress_bcn.c b/src/vulkan/runtime/vk_texcompress_bcn.c + vk_common_entrypoints = custom_target( + 'vk_common_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], +diff --git a/src/vulkan/runtime/vk_texcompress_astc.c b/src/vulkan/runtime/vk_texcompress_astc.c new file mode 100644 -index 00000000000..2c85d98af5c +index 00000000000..9275f426b76 --- /dev/null -+++ b/src/vulkan/runtime/vk_texcompress_bcn.c -@@ -0,0 +1,740 @@ ++++ b/src/vulkan/runtime/vk_texcompress_astc.c +@@ -0,0 +1,636 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining @@ -1257,39 +5077,13 @@ index 00000000000..2c85d98af5c + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + -+#include "vk_texcompress_bcn.h" -+#include "util/bc1_tables.h" ++#include "vk_texcompress_astc.h" ++#include "util/texcompress_astc_luts_wrap.h" +#include "vk_alloc.h" ++#include "vk_device.h" +#include "vk_format.h" +#include "vk_image.h" +#include "vk_physical_device.h" -+#include "log/log.h" -+ -+void -+vk_texcompress_insert_head(staging_images **head, bcn_image *current) -+{ -+ staging_images *node = (staging_images *)malloc(sizeof(staging_images)); -+ if (node == NULL) { -+ ALOGE("Failed to alloc staging_images node"); -+ return; -+ } -+ -+ node->current = current; -+ node->next = *head; -+ *head = node; -+} -+ -+void -+vk_texcompress_delete_head(staging_images **head, struct vk_device *device) -+{ -+ if (*head == NULL) { -+ return; -+ } -+ staging_images *current_head = *head; -+ vk_texcompress_finish_image(device, NULL, current_head->current); -+ *head = current_head->next; -+ free(current_head); -+} + +/* type_indexes_mask bits are set/clear for support memory type index as per + * struct VkPhysicalDeviceMemoryProperties.memoryTypes[] */ @@ -1317,9 +5111,11 @@ index 00000000000..2c85d98af5c +} + +static VkResult -+create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, struct vk_texcompress_bcn_state *bc1) ++vk_create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkDeviceSize size, VkMemoryPropertyFlags mem_prop_flags, ++ VkBufferUsageFlags usage_flags, VkBuffer *vk_buf, ++ VkDeviceMemory *vk_mem) +{ -+ VkDeviceSize size = sizeof(float) * (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)); + VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; @@ -1327,17 +5123,17 @@ index 00000000000..2c85d98af5c + VkBufferCreateInfo buffer_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = size, -+ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ++ .usage = usage_flags, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + result = -+ disp->CreateBuffer(_device, &buffer_create_info, allocator, &bc1->bc1_table_buf); ++ disp->CreateBuffer(_device, &buffer_create_info, allocator, vk_buf); + if (unlikely(result != VK_SUCCESS)) + return result; + + VkBufferMemoryRequirementsInfo2 mem_req_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, -+ .buffer = bc1->bc1_table_buf, ++ .buffer = *vk_buf, + }; + VkMemoryRequirements2 mem_req = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, @@ -1345,8 +5141,7 @@ index 00000000000..2c85d98af5c + disp->GetBufferMemoryRequirements2(_device, &mem_req_info, &mem_req); + + uint32_t mem_type_index = get_mem_type_index( -+ device, mem_req.memoryRequirements.memoryTypeBits, -+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); ++ device, mem_req.memoryRequirements.memoryTypeBits, mem_prop_flags); + if (mem_type_index == -1) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + @@ -1355,128 +5150,213 @@ index 00000000000..2c85d98af5c + .allocationSize = mem_req.memoryRequirements.size, + .memoryTypeIndex = mem_type_index, + }; -+ result = disp->AllocateMemory(_device, &alloc_info, allocator, &bc1->bc1_table_mem); ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, vk_mem); + if (unlikely(result != VK_SUCCESS)) -+ return result; -+ -+ result = disp->BindBufferMemory(_device, bc1->bc1_table_buf, bc1->bc1_table_mem, 0); -+ if (unlikely(result != VK_SUCCESS)) -+ return result; -+ -+ void *data; -+ disp->MapMemory(_device, bc1->bc1_table_mem, 0, size, 0, &data); -+ float 
(*data_array)[2] = (float(*)[2])data; -+ for (int i = 0; i < 256; i++) { -+ for (int j = 0; j < 2; j++) { -+ data_array[i][j] = (float)stb__OMatch5[i][j]; -+ data_array[i + 256][j] = (float)stb__OMatch6[i][j]; -+ } -+ } -+ disp->UnmapMemory(_device, bc1->bc1_table_mem); ++ return result; ++ ++ disp->BindBufferMemory(_device, *vk_buf, *vk_mem, 0); + + return result; +} + -+VkResult -+vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, -+ VkImage *image, VkExtent3D extent, VkFormat format, int size) ++static VkResult ++create_buffer_view(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkBufferView *buf_view, VkBuffer buf, VkFormat format, VkDeviceSize size, ++ VkDeviceSize offset) +{ -+ bool is_dummy = (size != 0); + VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + -+ VkImageCreateInfo image_create_info = { -+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, -+ .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, -+ .imageType = VK_IMAGE_TYPE_2D, ++ VkBufferViewCreateInfo buffer_view_create_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, ++ .buffer = buf, + .format = format, -+ .extent = extent, -+ .mipLevels = 1, -+ .arrayLayers = 1, -+ .samples = VK_SAMPLE_COUNT_1_BIT, -+ .tiling = VK_IMAGE_TILING_OPTIMAL, -+ .usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT, -+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, -+ .initialLayout = VK_IMAGE_LAYOUT_GENERAL, ++ .offset = offset, ++ .range = size, + }; -+ result = disp->CreateImage(_device, &image_create_info, allocator, image); -+ if (unlikely(result != VK_SUCCESS)) { -+ return result; ++ result = disp->CreateBufferView(_device, &buffer_view_create_info, ++ allocator, buf_view); ++ return result; ++} ++ ++static uint8_t ++get_partition_table_index(VkFormat format) ++{ ++ switch (format) { ++ case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: ++ return 0; ++ case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: ++ return 1; ++ case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: ++ return 2; ++ case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: ++ return 3; ++ case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: ++ return 4; ++ case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: ++ return 5; ++ case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: ++ return 6; ++ case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: ++ return 7; ++ case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: ++ return 8; ++ case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: ++ return 9; ++ case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: ++ return 10; ++ case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: ++ return 11; ++ case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: ++ return 12; ++ case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: ++ return 13; ++ default: ++ unreachable("bad astc format\n"); ++ return 0; + } ++} ++ ++static VkResult ++astc_prepare_buffer(struct vk_device *device, ++ struct vk_texcompress_astc_state *astc, ++ VkAllocationCallbacks *allocator, ++ VkDeviceSize minTexelBufferOffsetAlignment, 
++ uint8_t *single_buf_ptr, ++ VkDeviceSize *single_buf_size) ++{ ++ VkResult result; ++ astc_decoder_lut_holder astc_lut_holder; ++ VkDeviceSize offset = 0; + -+ if (is_dummy) { -+ VkMemoryRequirements mem_requirements; -+ disp->GetImageMemoryRequirements(_device, *image, &mem_requirements); ++ _mesa_init_astc_decoder_luts(&astc_lut_holder); + -+ uint32_t mem_type_index = get_mem_type_index( -+ device, mem_requirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); -+ if (mem_type_index == -1) { -+ return VK_ERROR_OUT_OF_HOST_MEMORY; -+ } ++ const astc_decoder_lut *luts[] = { ++ &astc_lut_holder.color_endpoint, ++ &astc_lut_holder.color_endpoint_unquant, ++ &astc_lut_holder.weights, ++ &astc_lut_holder.weights_unquant, ++ &astc_lut_holder.trits_quints, ++ }; + -+ VkMemoryAllocateInfo alloc_info = { -+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, -+ .allocationSize = size, -+ .memoryTypeIndex = mem_type_index, -+ }; -+ result = disp->AllocateMemory(_device, &alloc_info, allocator, pMem); -+ if (unlikely(result != VK_SUCCESS)) { -+ return result; ++ for (unsigned i = 0; i < ARRAY_SIZE(luts); i++) { ++ offset = align(offset, minTexelBufferOffsetAlignment); ++ if (single_buf_ptr) { ++ memcpy(single_buf_ptr + offset, luts[i]->data, luts[i]->size_B); ++ result = create_buffer_view(device, allocator, &astc->luts_buf_view[i], astc->luts_buf, ++ vk_format_from_pipe_format(luts[i]->format), luts[i]->size_B, ++ offset); ++ if (result != VK_SUCCESS) ++ return result; + } ++ offset += luts[i]->size_B; + } -+ if (is_dummy) { -+ disp->DestroyImage(_device, *image, allocator); -+ } -+ return result; -+} + -+void -+vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_image *intermidate_image) -+{ -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_rgba, NULL); -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc1, NULL); -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc4, NULL); -+ disp->FreeMemory(vk_device_to_handle(device), intermidate_image->image_mem, NULL); -+ vk_free(&device->alloc, intermidate_image); -+} ++ const VkFormat formats[] = { ++ VK_FORMAT_ASTC_4x4_UNORM_BLOCK, ++ VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ++ VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ++ VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ++ VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ++ }; + -+void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ disp->BindImageMemory(_device, *image, *pMem, offset); ++ for (uint32_t i = 0; i < ARRAY_SIZE(formats); i++) { ++ unsigned lut_width; ++ unsigned lut_height; ++ const void *lut_data = _mesa_get_astc_decoder_partition_table( ++ vk_format_get_blockwidth(formats[i]), ++ vk_format_get_blockheight(formats[i]), ++ &lut_width, &lut_height); ++ const unsigned lut_size = lut_width * lut_height; ++ ++ offset = align(offset, minTexelBufferOffsetAlignment); ++ if (single_buf_ptr) { ++ memcpy(single_buf_ptr + offset, lut_data, 
lut_width * lut_height); ++ ++ result = create_buffer_view(device, allocator, &astc->partition_tbl_buf_view[i], ++ astc->luts_buf, VK_FORMAT_R8_UINT, lut_width * lut_height, ++ offset); ++ if (result != VK_SUCCESS) ++ return result; ++ } ++ offset += lut_size; ++ } ++ ++ *single_buf_size = offset; ++ return result; +} + +static VkResult -+create_sampler(struct vk_device *device, VkAllocationCallbacks *allocator, -+ VkSampler *sampler, VkFilter filter, VkSamplerMipmapMode mipmapMode) ++create_fill_all_luts_vulkan(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ ++ VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical); ++ const struct vk_physical_device_dispatch_table *phy_disp = &device->physical->dispatch_table; ++ VkDeviceSize single_buf_size; ++ uint8_t *single_buf_ptr; + -+ VkSamplerCreateInfo create_info = { -+ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, -+ .magFilter = filter, -+ .minFilter = filter, -+ .addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .anisotropyEnable = VK_FALSE, -+ .borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK, -+ .unnormalizedCoordinates = VK_FALSE, -+ .compareEnable = VK_FALSE, -+ .mipmapMode = mipmapMode, ++ VkPhysicalDeviceProperties2 phy_dev_prop = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, ++ .pNext = NULL, + }; -+ VkResult result = disp->CreateSampler(_device, &create_info, allocator, sampler); ++ phy_disp->GetPhysicalDeviceProperties2(_phy_device, &phy_dev_prop); ++ ++ /* get the single_buf_size */ ++ result = astc_prepare_buffer(device, astc, allocator, ++ phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment, ++ NULL, &single_buf_size); ++ ++ /* create gpu buffer for all the luts */ ++ result = vk_create_buffer(device, allocator, single_buf_size, ++ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | ++ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ++ VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, ++ &astc->luts_buf, &astc->luts_mem); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ disp->MapMemory(_device, astc->luts_mem, 0, VK_WHOLE_SIZE, 0, (void*)&single_buf_ptr); ++ ++ /* fill all the luts and create views */ ++ result = astc_prepare_buffer(device, astc, allocator, ++ phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment, ++ single_buf_ptr, &single_buf_size); ++ ++ disp->UnmapMemory(_device, astc->luts_mem); + return result; +} + +static VkResult -+create_bc1_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++create_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ + VkResult result; + VkDevice _device = vk_device_to_handle(device); @@ -1484,101 +5364,57 @@ index 00000000000..2c85d98af5c + + VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 0, /* OutputImage2DArray */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ++ .binding = 1, /* PayloadInput2DArray */ ++ .descriptorType = 
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 2, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 2, /* LUTRemainingBitsToEndpointQuantizer */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, -+ }; -+ -+ VkDescriptorSetLayoutCreateInfo ds_create_info = { -+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, -+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, -+ .bindingCount = ARRAY_SIZE(bindings), -+ .pBindings = bindings, -+ }; -+ -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return result; -+} -+ -+static VkResult -+create_bc4_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ -+ VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 3, /* LUTEndpointUnquantize */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 4, /* LUTWeightQuantizer */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, -+ }; -+ -+ VkDescriptorSetLayoutCreateInfo ds_create_info = { -+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, -+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, -+ .bindingCount = ARRAY_SIZE(bindings), -+ .pBindings = bindings, -+ }; -+ -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return result; -+} -+ -+static VkResult -+create_bc3_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ -+ VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 5, /* LUTWeightUnquantize */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 6, /* LUTTritQuintDecode */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 2, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 7, /* LUTPartitionTable */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, @@ -1592,61 +5428,32 @@ index 00000000000..2c85d98af5c + .pBindings = bindings, + }; + -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return 
result; -+} -+ -+static VkResult -+create_pipeline_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, enum compute_pipeline_id id) -+{ -+ VkResult result = VK_RESULT_MAX_ENUM; -+ switch (id) { -+ case BC1_ENCODE: -+ result = create_bc1_layout(device, allocator, bcn); -+ break; -+ case BC4_ENCODE: -+ result = create_bc4_layout(device, allocator, bcn); -+ break; -+ case BC3_ENCODE: -+ result = create_bc3_layout(device, allocator, bcn); -+ break; -+ default: -+ goto fail; -+ } -+ if (result != VK_SUCCESS) { ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, ++ allocator, &astc->ds_layout); ++ if (result != VK_SUCCESS) + goto fail; -+ } -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ + VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, -+ .pSetLayouts = &bcn->ds_layout, ++ .pSetLayouts = &astc->ds_layout, + .pushConstantRangeCount = 1, -+ .pPushConstantRanges = &(VkPushConstantRange) {VK_SHADER_STAGE_COMPUTE_BIT, 0, 8} ++ .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20}, + }; -+ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, &bcn->p_layout); ++ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, ++ &astc->p_layout); +fail: + return result; +} + -+static const uint32_t bc1_spv[] = { -+#include "bc1_spv.h" -+}; -+ -+static const uint32_t bc4_spv[] = { -+#include "bc4_spv.h" -+}; -+ -+static const uint32_t bc3_spv[] = { -+#include "bc3_spv.h" ++static const uint32_t astc_spv[] = { ++#include "astc_spv.h" +}; + +static VkResult -+vk_bc1_create_shader_module(struct vk_device *device, ++vk_astc_create_shader_module(struct vk_device *device, + VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++ struct vk_texcompress_astc_state *astc) +{ + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; @@ -1655,131 +5462,114 @@ index 00000000000..2c85d98af5c + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, -+ .codeSize = sizeof(bc1_spv), -+ .pCode = bc1_spv, ++ .codeSize = sizeof(astc_spv), ++ .pCode = astc_spv, + }; + -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++ return disp->CreateShaderModule(_device, &shader_module_create_info, ++ allocator, &astc->shader_module); +} + +static VkResult -+vk_bc4_create_shader_module(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++create_astc_decode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, ++ VkPipelineCache pipeline_cache, VkFormat format) +{ ++ VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipeline pipeline; ++ uint8_t t_i; + -+ VkShaderModuleCreateInfo shader_module_create_info = { -+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, -+ .pNext = NULL, -+ .flags = 0, -+ .codeSize = sizeof(bc4_spv), -+ .pCode = bc4_spv, -+ }; -+ -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); -+} -+ -+static VkResult -+vk_bc3_create_shader_module(struct vk_device *device, -+ VkAllocationCallbacks *allocator, 
-+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ t_i = get_partition_table_index(format); + -+ VkShaderModuleCreateInfo shader_module_create_info = { -+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, -+ .pNext = NULL, -+ .flags = 0, -+ .codeSize = sizeof(bc3_spv), -+ .pCode = bc3_spv, ++ uint32_t special_data[3] = { ++ vk_format_get_blockwidth(format), ++ vk_format_get_blockheight(format), ++ true, ++ }; ++ VkSpecializationMapEntry special_map_entry[3] = {{ ++ .constantID = 0, ++ .offset = 0, ++ .size = 4, ++ }, ++ { ++ .constantID = 1, ++ .offset = 4, ++ .size = 4, ++ }, ++ { ++ .constantID = 2, ++ .offset = 8, ++ .size = 4, ++ }}; ++ ++ VkSpecializationInfo specialization_info = { ++ .mapEntryCount = 3, ++ .pMapEntries = special_map_entry, ++ .dataSize = 12, ++ .pData = special_data, + }; -+ -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); -+} -+ -+static VkResult -+create_bcn_encode_pipeline(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, -+ VkPipelineCache pipeline_cache) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ VkPipeline pipeline; + + /* compute shader */ + VkPipelineShaderStageCreateInfo pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, -+ .module = bcn->shader_module, ++ .module = astc->shader_module, + .pName = "main", ++ .pSpecializationInfo = &specialization_info, + }; + + VkComputePipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = pipeline_shader_stage, + .flags = 0, -+ .layout = bcn->p_layout, ++ .layout = astc->p_layout, + }; + -+ result = disp->CreateComputePipelines(_device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); ++ result = disp->CreateComputePipelines( ++ _device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); + if (result != VK_SUCCESS) + return result; + -+ bcn->pipeline = pipeline; ++ astc->pipeline[t_i] = pipeline; ++ astc->pipeline_mask |= (1 << t_i); + + return result; +} + +VkPipeline -+vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, VkPipelineCache pipeline_cache, -+ enum compute_pipeline_id id) ++vk_texcompress_astc_get_decode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, VkPipelineCache pipeline_cache, ++ VkFormat format) +{ + VkResult result; ++ uint8_t t_i = get_partition_table_index(format); + -+ simple_mtx_lock(&bcn->mutex); -+ -+ if (bcn->pipeline) -+ goto unlock; ++ simple_mtx_lock(&astc->mutex); + -+ if (id >= MAX_PIPELINE_NUM || id < 0) ++ if (astc->pipeline[t_i]) + goto unlock; + -+ if (id == BC1_ENCODE && !bcn->shader_module) { -+ result = vk_bc1_create_shader_module(device, allocator, bcn); -+ if (result != VK_SUCCESS) -+ goto unlock; -+ } -+ -+ if (id == BC4_ENCODE && !bcn->shader_module) { -+ result = vk_bc4_create_shader_module(device, allocator, bcn); -+ if (result != VK_SUCCESS) -+ goto unlock; -+ } -+ -+ if (id == BC3_ENCODE && !bcn->shader_module) { -+ result = vk_bc3_create_shader_module(device, allocator, bcn); ++ if (!astc->shader_module) { ++ result = 
vk_astc_create_shader_module(device, allocator, astc); + if (result != VK_SUCCESS) + goto unlock; + } + -+ create_bcn_encode_pipeline(device, allocator, bcn, pipeline_cache); ++ create_astc_decode_pipeline(device, allocator, astc, pipeline_cache, format); + +unlock: -+ simple_mtx_unlock(&bcn->mutex); -+ return bcn->pipeline; ++ simple_mtx_unlock(&astc->mutex); ++ return astc->pipeline[t_i]; +} + +static inline void -+fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkSampler sampler, VkImageView img_view, ++fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkImageView img_view, + VkImageLayout img_layout) +{ -+ info->sampler = sampler; ++ info->sampler = VK_NULL_HANDLE; + info->imageView = img_view; + info->imageLayout = img_layout; +} @@ -1801,9 +5591,9 @@ index 00000000000..2c85d98af5c +} + +static inline void -+fill_write_descriptor_set_buffer(VkWriteDescriptorSet *set, -+ uint8_t bind_i, VkDescriptorType desc_type, -+ VkDescriptorBufferInfo *buf_info) ++fill_write_descriptor_set_uniform_texel(VkWriteDescriptorSet *set, ++ uint8_t bind_i, ++ VkBufferView *buf_view) +{ + set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + set->pNext = NULL; @@ -1811,177 +5601,102 @@ index 00000000000..2c85d98af5c + set->dstBinding = bind_i; + set->dstArrayElement = 0; + set->descriptorCount = 1; -+ set->descriptorType = desc_type; ++ set->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; + set->pImageInfo = NULL; -+ set->pBufferInfo = buf_info; -+ set->pTexelBufferView = NULL; ++ set->pBufferInfo = NULL; ++ set->pTexelBufferView = buf_view; +} + +void -+vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, ++vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc, ++ struct vk_texcompress_astc_write_descriptor_set *set, + VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view) -+{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ set->bc1_buffer_info.buffer = bcn->bc1_table_buf; -+ set->bc1_buffer_info.offset = 0; -+ set->bc1_buffer_info.range = VK_WHOLE_SIZE; -+ fill_write_descriptor_set_buffer(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &set->bc1_buffer_info); -+ -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, -+ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); -+} -+ -+void -+vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bc4_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view) -+{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); -+} -+ -+void -+vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ 
struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, -+ VkImageView dst_img_view) ++ VkImageView dst_img_view, ++ VkFormat format) +{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, bc1_src_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ fill_desc_image_info_struct(&set->src_desc_image_info_2, bcn->sampler, bc4_src_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info_2); ++ unsigned desc_i; + -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ desc_i = 0; ++ fill_desc_image_info_struct(&set->dst_desc_image_info, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++ desc_i++; ++ fill_desc_image_info_struct(&set->src_desc_image_info, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i, ++ VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, &set->src_desc_image_info); ++ /* fill luts descriptor */ ++ desc_i++; ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++) { ++ fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i + i], desc_i + i, ++ &astc->luts_buf_view[i]); ++ } ++ desc_i += VK_TEXCOMPRESS_ASTC_NUM_LUTS; ++ uint8_t t_i = get_partition_table_index(format); ++ fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i], desc_i, ++ &astc->partition_tbl_buf_view[t_i]); ++ desc_i++; ++ assert(desc_i == ARRAY_SIZE(set->descriptor_set)); +} + +VkResult -+vk_texcompress_bcn_init(struct vk_device *device, VkAllocationCallbacks *allocator, -+ VkPipelineCache pipeline_cache, -+ struct vk_texcompress_bcn_state *bcn[]) ++vk_texcompress_astc_init(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_astc_state **astc) +{ + VkResult result; + -+ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { -+ /* memory to be freed as part of vk_bcn_decode_finish() */ -+ bcn[pi] = vk_zalloc(allocator, sizeof(struct vk_texcompress_bcn_state), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); -+ if (bcn[pi] == NULL) { -+ return VK_ERROR_OUT_OF_HOST_MEMORY; -+ } -+ simple_mtx_init(&(bcn[pi])->mutex, mtx_plain); -+ } -+ -+ result = create_buffer(device, allocator, bcn[BC1_ENCODE]); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_sampler(device, allocator, &bcn[BC1_ENCODE]->sampler, -+ VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_pipeline_layout(device, allocator, bcn[BC1_ENCODE], BC1_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_sampler(device, allocator, &bcn[BC4_ENCODE]->sampler, -+ VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR); -+ if (result != VK_SUCCESS) -+ goto fail; ++ /* astc memory to be freed as part of vk_astc_decode_finish() */ ++ *astc = vk_zalloc(allocator, sizeof(struct vk_texcompress_astc_state), 8, ++ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (*astc == NULL) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; + -+ result = create_pipeline_layout(device, allocator, 
bcn[BC4_ENCODE], BC4_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; ++ simple_mtx_init(&(*astc)->mutex, mtx_plain); + -+ // 由于VK_FORMAT_R32G32_UINT格式的imageView格式不包含VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, 采用nearest -+ result = create_sampler(device, allocator, &bcn[BC3_ENCODE]->sampler, -+ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST); ++ result = create_fill_all_luts_vulkan(device, allocator, *astc); + if (result != VK_SUCCESS) + goto fail; + -+ result = create_pipeline_layout(device, allocator, bcn[BC3_ENCODE], BC3_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; ++ result = create_layout(device, allocator, *astc); + +fail: + return result; +} + +void -+vk_texcompress_bcn_finish(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn[]) ++vk_texcompress_astc_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + -+ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { -+ if (bcn[pi]->pipeline) -+ disp->DestroyPipeline(_device, bcn[pi]->pipeline, allocator); -+ -+ disp->DestroyPipelineLayout(_device, bcn[pi]->p_layout, allocator); -+ disp->DestroyShaderModule(_device, bcn[pi]->shader_module, allocator); -+ disp->DestroyDescriptorSetLayout(_device, bcn[pi]->ds_layout, allocator); -+ -+ if (pi == BC1_ENCODE) { -+ disp->DestroyBuffer(_device, bcn[pi]->bc1_table_buf, allocator); -+ disp->FreeMemory(_device, bcn[pi]->bc1_table_mem, allocator); -+ } -+ vk_free(allocator, bcn[pi]); ++ while (astc->pipeline_mask) { ++ uint8_t t_i = u_bit_scan(&astc->pipeline_mask); ++ disp->DestroyPipeline(_device, astc->pipeline[t_i], allocator); + } -+} + -+void -+vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->DestroyPipelineLayout(_device, astc->p_layout, allocator); ++ disp->DestroyShaderModule(_device, astc->shader_module, allocator); ++ disp->DestroyDescriptorSetLayout(_device, astc->ds_layout, allocator); + -+ VkFenceCreateInfo create_info = { -+ .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, -+ .flags = VK_FENCE_CREATE_SIGNALED_BIT, -+ }; -+ VkResult result = disp->CreateFence(_device, &create_info, allocator, fence); -+ if (result != VK_SUCCESS) { -+ ALOGE("Failed to create fence"); -+ } -+} ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++) ++ disp->DestroyBufferView(_device, astc->luts_buf_view[i], allocator); + -+void -+vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES; i++) ++ disp->DestroyBufferView(_device, astc->partition_tbl_buf_view[i], allocator); + -+ VkResult result = disp->WaitForFences(_device, 1, fence, VK_TRUE, UINT64_MAX); -+ if (result != VK_SUCCESS) { -+ ALOGE("Failed to wait for fence"); -+ } ++ disp->DestroyBuffer(_device, astc->luts_buf, allocator); ++ disp->FreeMemory(_device, astc->luts_mem, allocator); ++ ++ vk_free(allocator, astc); +} -\ No newline at end of file -diff --git a/src/vulkan/runtime/vk_texcompress_bcn.h b/src/vulkan/runtime/vk_texcompress_bcn.h 
+diff --git a/src/vulkan/runtime/vk_texcompress_astc.h b/src/vulkan/runtime/vk_texcompress_astc.h new file mode 100644 -index 00000000000..ca8189b657e +index 00000000000..417ab434be2 --- /dev/null -+++ b/src/vulkan/runtime/vk_texcompress_bcn.h -@@ -0,0 +1,142 @@ ++++ b/src/vulkan/runtime/vk_texcompress_astc.h +@@ -0,0 +1,122 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining @@ -2003,124 +5718,337 @@ index 00000000000..ca8189b657e + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ -+#ifndef VK_TEXCOMPRESS_BCN_H -+#define VK_TEXCOMPRESS_BCN_H ++#ifndef VK_TEXCOMPRESS_ASTC_H ++#define VK_TEXCOMPRESS_ASTC_H + +#include "util/simple_mtx.h" -+#include "vk_fence.h" +#include "vk_device.h" + -+#define VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES 3 -+#define VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT 3 // 每个着色器最多有3个描述符集 -+ -+struct vk_texcompress_bcn_write_descriptor_set { -+ VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT]; -+ VkDescriptorImageInfo dst_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info_2; -+ VkDescriptorBufferInfo bc1_buffer_info; -+}; -+ -+struct vk_texcompress_bc4_write_descriptor_set { -+ VkWriteDescriptorSet descriptor_set[2]; -+ VkDescriptorImageInfo dst_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info; -+}; -+ -+typedef struct vk_texcompress_bcn_image { -+ VkImage image_dummy; -+ VkImage image_rgba; -+ VkImage image_bc1; -+ VkImage image_bc4; -+ VkDeviceMemory image_mem; -+} bcn_image; -+ -+struct vk_texcompress_dummy_image { -+ VkImage image_dummy; -+ VkDeviceMemory image_mem; -+}; ++/* luts order matching astc glsl shader below, ++ * 0 - color endpoint ++ * 1 - color endpoint unquant ++ * 2 - weights ++ * 3 - weights unquant ++ * 4 - trits quints ++ */ ++#define VK_TEXCOMPRESS_ASTC_NUM_LUTS 5 ++#define VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES 14 ++#define VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT 8 + -+typedef struct vk_texcompress_staging_images { -+ struct vk_texcompress_bcn_image *current; -+ struct vk_texcompress_staging_images* next; -+} staging_images; ++struct vk_texcompress_astc_state { ++ /* single buffer is allocated for all luts */ ++ VkDeviceMemory luts_mem; ++ VkBuffer luts_buf; + -+enum compute_pipeline_id { -+ BC1_ENCODE = 0, -+ BC4_ENCODE, -+ BC3_ENCODE, -+ MAX_PIPELINE_NUM -+}; ++ VkBufferView luts_buf_view[VK_TEXCOMPRESS_ASTC_NUM_LUTS]; ++ VkBufferView partition_tbl_buf_view[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES]; + -+struct vk_texcompress_bcn_state { + simple_mtx_t mutex; + VkDescriptorSetLayout ds_layout; + VkPipelineLayout p_layout; -+ VkPipeline pipeline; ++ VkPipeline pipeline[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES]; ++ uint32_t pipeline_mask; + VkShaderModule shader_module; -+ VkSampler sampler; -+ -+ VkDeviceMemory bc1_table_mem; -+ VkBuffer bc1_table_buf; +}; + -+void -+vk_texcompress_insert_head(staging_images **head, bcn_image *current); -+ -+void -+vk_texcompress_delete_head(staging_images **head, struct vk_device *device); -+ -+VkResult -+vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, -+ VkImage *image, VkExtent3D extent, VkFormat format, int size); -+ -+void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset); -+ -+void -+vk_texcompress_finish_image(struct vk_device *device, const 
VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_image *intermidate_image); -+ -+void -+vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view); -+ -+void -+vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bc4_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view); ++struct vk_texcompress_astc_write_descriptor_set { ++ VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT]; ++ VkDescriptorImageInfo dst_desc_image_info; ++ VkDescriptorImageInfo src_desc_image_info; ++}; + +void -+vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, -+ VkImageView dst_img_view); -+ -+VkPipeline -+vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, -+ VkPipelineCache pipeline_cache, -+ enum compute_pipeline_id id); ++vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc, ++ struct vk_texcompress_astc_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view, ++ VkFormat format); ++VkPipeline vk_texcompress_astc_get_decode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, ++ VkPipelineCache pipeline_cache, ++ VkFormat format); ++VkResult vk_texcompress_astc_init(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_astc_state **astc); ++void vk_texcompress_astc_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc); ++ ++static inline VkFormat ++vk_texcompress_astc_emulation_format(VkFormat format) ++{ ++ /* TODO: From VK_EXT_astc_Decode_mode spec, VK_FORMAT_R16G16B16A16_SFLOAT is the default ++ * option. VK_FORMAT_R8G8B8A8_UNORM is only acceptable image quality option. 
++ */ ++ switch (format) { ++ case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: ++ return VK_FORMAT_R8G8B8A8_UNORM; ++ case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: ++ return VK_FORMAT_R8G8B8A8_SRGB; ++ default: ++ return VK_FORMAT_UNDEFINED; ++ } ++} + -+VkResult -+vk_texcompress_bcn_init(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ VkPipelineCache pipeline_cache, -+ struct vk_texcompress_bcn_state *bcn[]); ++#endif /* VK_TEXCOMPRESS_ASTC_H */ +diff --git a/src/vulkan/util/vk_format.c b/src/vulkan/util/vk_format.c +index f5da22bac88..57984de2227 100644 +--- a/src/vulkan/util/vk_format.c ++++ b/src/vulkan/util/vk_format.c +@@ -288,6 +288,201 @@ vk_format_to_pipe_format(enum VkFormat vkformat) + return vk_format_map[vkformat]; + } + + -+void -+vk_texcompress_bcn_finish(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn[]); ++static const VkFormat formats[PIPE_FORMAT_COUNT] = { ++#define MAP_FORMAT_NORM(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _UNORM] = VK_FORMAT_ ## FMT ## _UNORM, \ ++ [PIPE_FORMAT_ ## FMT ## _SNORM] = VK_FORMAT_ ## FMT ## _SNORM, ++ ++#define MAP_FORMAT_SCALED(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _USCALED] = VK_FORMAT_ ## FMT ## _USCALED, \ ++ [PIPE_FORMAT_ ## FMT ## _SSCALED] = VK_FORMAT_ ## FMT ## _SSCALED, ++ ++#define MAP_FORMAT_INT(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _UINT] = VK_FORMAT_ ## FMT ## _UINT, \ ++ [PIPE_FORMAT_ ## FMT ## _SINT] = VK_FORMAT_ ## FMT ## _SINT, ++ ++#define MAP_FORMAT_SRGB(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _SRGB] = VK_FORMAT_ ## FMT ## _SRGB, ++ ++#define MAP_FORMAT_FLOAT(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _FLOAT] = VK_FORMAT_ ## FMT ## _SFLOAT, ++ ++ // one component ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8) ++ MAP_FORMAT_SCALED(R8) ++ MAP_FORMAT_INT(R8) ++ MAP_FORMAT_SRGB(R8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16) ++ MAP_FORMAT_SCALED(R16) ++ MAP_FORMAT_INT(R16) ++ MAP_FORMAT_FLOAT(R16) ++ // 32-bits ++ MAP_FORMAT_INT(R32) ++ MAP_FORMAT_FLOAT(R32) ++ ++ // two components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8) ++ MAP_FORMAT_SCALED(R8G8) ++ MAP_FORMAT_INT(R8G8) ++ MAP_FORMAT_SRGB(R8G8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16) ++ MAP_FORMAT_SCALED(R16G16) ++ MAP_FORMAT_INT(R16G16) ++ MAP_FORMAT_FLOAT(R16G16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32) ++ MAP_FORMAT_FLOAT(R32G32) ++ ++ // three components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8B8) ++ MAP_FORMAT_SCALED(R8G8B8) ++ MAP_FORMAT_INT(R8G8B8) ++ MAP_FORMAT_SRGB(R8G8B8) ++ MAP_FORMAT_NORM(B8G8R8) ++ MAP_FORMAT_SCALED(B8G8R8) ++ 
MAP_FORMAT_INT(B8G8R8) ++ MAP_FORMAT_SRGB(B8G8R8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16B16) ++ MAP_FORMAT_SCALED(R16G16B16) ++ MAP_FORMAT_INT(R16G16B16) ++ MAP_FORMAT_FLOAT(R16G16B16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32B32) ++ MAP_FORMAT_FLOAT(R32G32B32) ++ ++ // four components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8B8A8) ++ MAP_FORMAT_SCALED(R8G8B8A8) ++ MAP_FORMAT_INT(R8G8B8A8) ++ MAP_FORMAT_NORM(B8G8R8A8) ++ MAP_FORMAT_SCALED(B8G8R8A8) ++ MAP_FORMAT_INT(B8G8R8A8) ++ MAP_FORMAT_SRGB(B8G8R8A8) ++ [PIPE_FORMAT_RGBA8888_SRGB] = VK_FORMAT_A8B8G8R8_SRGB_PACK32, ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16B16A16) ++ MAP_FORMAT_SCALED(R16G16B16A16) ++ MAP_FORMAT_INT(R16G16B16A16) ++ MAP_FORMAT_FLOAT(R16G16B16A16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32B32A32) ++ MAP_FORMAT_FLOAT(R32G32B32A32) ++ ++ // other color formats ++ [PIPE_FORMAT_A4B4G4R4_UNORM] = VK_FORMAT_R4G4B4A4_UNORM_PACK16, ++ [PIPE_FORMAT_A4R4G4B4_UNORM] = VK_FORMAT_B4G4R4A4_UNORM_PACK16, ++ [PIPE_FORMAT_B4G4R4A4_UNORM] = VK_FORMAT_A4R4G4B4_UNORM_PACK16, ++ [PIPE_FORMAT_R4G4B4A4_UNORM] = VK_FORMAT_A4B4G4R4_UNORM_PACK16, ++ [PIPE_FORMAT_B5G6R5_UNORM] = VK_FORMAT_R5G6B5_UNORM_PACK16, ++ [PIPE_FORMAT_R5G6B5_UNORM] = VK_FORMAT_B5G6R5_UNORM_PACK16, ++ ++ [PIPE_FORMAT_A1B5G5R5_UNORM] = VK_FORMAT_R5G5B5A1_UNORM_PACK16, ++ [PIPE_FORMAT_A1R5G5B5_UNORM] = VK_FORMAT_B5G5R5A1_UNORM_PACK16, ++ [PIPE_FORMAT_B5G5R5A1_UNORM] = VK_FORMAT_A1R5G5B5_UNORM_PACK16, ++ ++ [PIPE_FORMAT_R11G11B10_FLOAT] = VK_FORMAT_B10G11R11_UFLOAT_PACK32, ++ [PIPE_FORMAT_R9G9B9E5_FLOAT] = VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, ++ /* ARB_vertex_type_2_10_10_10 */ ++ [PIPE_FORMAT_R10G10B10A2_UNORM] = VK_FORMAT_A2B10G10R10_UNORM_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_SNORM] = VK_FORMAT_A2B10G10R10_SNORM_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_UNORM] = VK_FORMAT_A2R10G10B10_UNORM_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SNORM] = VK_FORMAT_A2R10G10B10_SNORM_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_USCALED] = VK_FORMAT_A2B10G10R10_USCALED_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_SSCALED] = VK_FORMAT_A2B10G10R10_SSCALED_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_USCALED] = VK_FORMAT_A2R10G10B10_USCALED_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SSCALED] = VK_FORMAT_A2R10G10B10_SSCALED_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_UINT] = VK_FORMAT_A2B10G10R10_UINT_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_UINT] = VK_FORMAT_A2R10G10B10_UINT_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SINT] = VK_FORMAT_A2R10G10B10_SINT_PACK32, ++ ++ // depth/stencil formats ++ [PIPE_FORMAT_Z32_FLOAT] = VK_FORMAT_D32_SFLOAT, ++ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = VK_FORMAT_D32_SFLOAT_S8_UINT, ++ [PIPE_FORMAT_Z16_UNORM] = VK_FORMAT_D16_UNORM, ++ [PIPE_FORMAT_Z16_UNORM_S8_UINT] = VK_FORMAT_D16_UNORM_S8_UINT, ++ [PIPE_FORMAT_Z24X8_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32, ++ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = VK_FORMAT_D24_UNORM_S8_UINT, ++ [PIPE_FORMAT_S8_UINT] = VK_FORMAT_S8_UINT, ++ ++ // compressed formats ++ [PIPE_FORMAT_DXT1_RGB] = VK_FORMAT_BC1_RGB_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT1_RGBA] = VK_FORMAT_BC1_RGBA_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT3_RGBA] = VK_FORMAT_BC2_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT5_RGBA] = VK_FORMAT_BC3_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT1_SRGB] = VK_FORMAT_BC1_RGB_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT1_SRGBA] = VK_FORMAT_BC1_RGBA_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT3_SRGBA] = VK_FORMAT_BC2_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT5_SRGBA] = VK_FORMAT_BC3_SRGB_BLOCK, ++ ++ [PIPE_FORMAT_RGTC1_UNORM] = VK_FORMAT_BC4_UNORM_BLOCK, ++ [PIPE_FORMAT_RGTC1_SNORM] = VK_FORMAT_BC4_SNORM_BLOCK, ++ [PIPE_FORMAT_RGTC2_UNORM] = VK_FORMAT_BC5_UNORM_BLOCK, ++ 
[PIPE_FORMAT_RGTC2_SNORM] = VK_FORMAT_BC5_SNORM_BLOCK, ++ [PIPE_FORMAT_BPTC_RGBA_UNORM] = VK_FORMAT_BC7_UNORM_BLOCK, ++ [PIPE_FORMAT_BPTC_SRGBA] = VK_FORMAT_BC7_SRGB_BLOCK, ++ [PIPE_FORMAT_BPTC_RGB_FLOAT] = VK_FORMAT_BC6H_SFLOAT_BLOCK, ++ [PIPE_FORMAT_BPTC_RGB_UFLOAT] = VK_FORMAT_BC6H_UFLOAT_BLOCK, ++ ++ [PIPE_FORMAT_ETC1_RGB8] = VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RGB8] = VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGB8] = VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_RGB8A1] = VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGB8A1] = VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_RGBA8] = VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGBA8] = VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_R11_UNORM] = VK_FORMAT_EAC_R11_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_R11_SNORM] = VK_FORMAT_EAC_R11_SNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RG11_UNORM] = VK_FORMAT_EAC_R11G11_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RG11_SNORM] = VK_FORMAT_EAC_R11G11_SNORM_BLOCK, ++ ++ [PIPE_FORMAT_ASTC_4x4] = VK_FORMAT_ASTC_4x4_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_4x4_SRGB] = VK_FORMAT_ASTC_4x4_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_5x4] = VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_5x4_SRGB] = VK_FORMAT_ASTC_5x4_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_5x5] = VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_5x5_SRGB] = VK_FORMAT_ASTC_5x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_6x5] = VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_6x5_SRGB] = VK_FORMAT_ASTC_6x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_6x6] = VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_6x6_SRGB] = VK_FORMAT_ASTC_6x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x5] = VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x5_SRGB] = VK_FORMAT_ASTC_8x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x6] = VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x6_SRGB] = VK_FORMAT_ASTC_8x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x8] = VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x8_SRGB] = VK_FORMAT_ASTC_8x8_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x5] = VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x5_SRGB] = VK_FORMAT_ASTC_10x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x6] = VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x6_SRGB] = VK_FORMAT_ASTC_10x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x8] = VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x8_SRGB] = VK_FORMAT_ASTC_10x8_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x10] = VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x10_SRGB] = VK_FORMAT_ASTC_10x10_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_12x10] = VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_12x10_SRGB] = VK_FORMAT_ASTC_12x10_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_12x12] = VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_12x12_SRGB] = VK_FORMAT_ASTC_12x12_SRGB_BLOCK, ++}; + ++VkFormat ++vk_format_from_pipe_format(enum pipe_format format) ++{ ++ return formats[format]; ++} + -+void -+vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); + VkImageAspectFlags + vk_format_aspects(VkFormat format) + { +diff --git a/src/vulkan/util/vk_format.h b/src/vulkan/util/vk_format.h +index 630cfbe5929..ca43c134bbb 100644 +--- a/src/vulkan/util/vk_format.h ++++ b/src/vulkan/util/vk_format.h +@@ -35,6 +35,9 @@ extern "C" { + enum pipe_format + vk_format_to_pipe_format(enum VkFormat vkformat); + ++VkFormat ++vk_format_from_pipe_format(enum pipe_format format); + -+void -+vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence 
*fence); + VkImageAspectFlags + vk_format_aspects(VkFormat format); + +@@ -158,6 +161,12 @@ vk_format_is_compressed(VkFormat format) + return vk_format_get_blockwidth(format) > 1; + } + ++static inline bool ++vk_format_is_block_compressed(VkFormat format) ++{ ++ return util_format_is_compressed(vk_format_to_pipe_format(format)); ++} + -+#endif /* VK_TEXCOMPRESS_BCN_H */ + static inline const struct util_format_description * + vk_format_description(VkFormat format) + { -- Gitee From f1b25b20cc768bab1e7ad3b85d72ed5c375f14d0 Mon Sep 17 00:00:00 2001 From: zhuanghb3 Date: Thu, 25 Sep 2025 02:18:45 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=E4=BD=BF=E8=83=BDopengles=E6=94=AF?= =?UTF-8?q?=E6=8C=81astc=E6=A0=BC=E5=BC=8F=E7=A1=AC=E8=A7=A3=E7=A1=AC?= =?UTF-8?q?=E7=BC=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- patchForAndroid/external-mesa-0004.patch | 2126 ++++++++++++++++++++++ 1 file changed, 2126 insertions(+) create mode 100644 patchForAndroid/external-mesa-0004.patch diff --git a/patchForAndroid/external-mesa-0004.patch b/patchForAndroid/external-mesa-0004.patch new file mode 100644 index 0000000..bd9c3bf --- /dev/null +++ b/patchForAndroid/external-mesa-0004.patch @@ -0,0 +1,2126 @@ +diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build +index 71c8b16d4fc..691ebd9d616 100644 +--- a/src/amd/vulkan/meson.build ++++ b/src/amd/vulkan/meson.build +@@ -70,6 +70,7 @@ libradv_files = files( + 'radv_meta_fmask_copy.c', + 'radv_meta_fmask_expand.c', + 'radv_meta_astc_decode.c', ++ 'radv_meta_rgba_encode.c', + 'radv_meta_resolve.c', + 'radv_meta_resolve_cs.c', + 'radv_meta_resolve_fs.c', +diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c +index 3fd4ea777a8..dfd7bdca894 100644 +--- a/src/amd/vulkan/radv_image.c ++++ b/src/amd/vulkan/radv_image.c +@@ -38,6 +38,7 @@ + #include + + #include "gfx10_format_table.h" ++#include "vk_texcompress_bcn.h" + + static const VkImageUsageFlagBits RADV_IMAGE_USAGE_WRITE_BITS = + VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | +@@ -546,6 +547,66 @@ radv_patch_image_from_extra_info(struct radv_device *device, struct radv_image * + + extern bool radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + ++static char isEnableHardEncode[PROPERTY_VALUE_MAX]; ++VkFormat ++radv_translate_rgba2dxt5(VkFormat format) ++{ ++ switch (format) { ++ case VK_FORMAT_R8G8B8A8_UNORM: ++ return VK_FORMAT_BC3_UNORM_BLOCK; ++ case VK_FORMAT_R8G8B8A8_SRGB: ++ return VK_FORMAT_BC3_SRGB_BLOCK; ++ default: ++ return format; ++ } ++} ++ ++static bool ++radv_need_hard_encode(const struct radv_physical_device *pdev, const VkImageCreateInfo *pCreateInfo) ++{ ++ if (property_get("sys.vmi.vk.texturecompress", isEnableHardEncode, "0") && strcmp(isEnableHardEncode, "2") != 0) { ++ return false; ++ } ++ ++ // 暂不支持对图像数组进行压缩 ++ if (pCreateInfo->arrayLayers > 1) { ++ return false; ++ } ++ ++ if (!radv_is_format_emulated(pdev, pCreateInfo->format)) { ++ return false; ++ } ++ ++ const enum util_format_layout format_layout = vk_format_description(pCreateInfo->format)->layout; ++ if (format_layout != UTIL_FORMAT_LAYOUT_ASTC && ++ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_UNORM && ++ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_SRGB) { ++ return false; ++ } ++ ++ // 过滤掉对稀疏纹理的压缩,对该部分纹理进行压缩时可能造成GPU reset。 ++ if (pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT | 
VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
++ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT)) {
++ return false;
++ }
++
++ // Check the image usage so that swapchain images are never compressed, and only compress VK_IMAGE_TYPE_2D textures
++ if (!(pCreateInfo->usage & VK_IMAGE_USAGE_SAMPLED_BIT) || pCreateInfo->imageType != VK_IMAGE_TYPE_2D) {
++ return false;
++ }
++
++ // The image is too large; the staging memory allocation would fail
++ if (pCreateInfo->extent.width >= 3960 && pCreateInfo->extent.height >= 3960) {
++ return false;
++ }
++
++ // Check the image size: only images of at least 256x256 are worth compressing
++ if (pCreateInfo->extent.width >= 256 && pCreateInfo->extent.height >= 256) {
++ return true;
++ }
++ return false;
++}
++
+ static VkFormat
+ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struct radv_image *image,
+ unsigned plane)
+@@ -553,10 +614,15 @@ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struc
+ if (radv_is_format_emulated(pdev, image->vk_format)) {
+ if (plane == 0)
+ return image->vk_format;
+- if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC)
++ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) {
++ if (image->isNeedHardEncode)
++ return radv_translate_rgba2dxt5(vk_texcompress_astc_emulation_format(image->vk_format));
+ return vk_texcompress_astc_emulation_format(image->vk_format);
+- else
++ } else {
++ if (image->isNeedHardEncode)
++ return radv_translate_rgba2dxt5(vk_texcompress_etc2_emulation_format(image->vk_format));
+ return vk_texcompress_etc2_emulation_format(image->vk_format);
++ }
+ }
+ return vk_format_get_plane_format(image->vk_format, plane);
+ }
+@@ -1710,6 +1776,12 @@ radv_destroy_image(struct radv_device *device, const VkAllocationCallbacks *pAll
+ vk_free2(&device->vk.alloc, pAllocator, image);
+ }
+
++void
++radv_internal_image_finish(struct radv_image *image)
++{
++ vk_object_base_finish(&image->base);
++}
++
+ static void
+ radv_image_print_info(struct radv_device *device, struct radv_image *image)
+ {
+@@ -1890,9 +1962,11 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_
+ vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO);
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO);
+
++ bool needHardEncode = radv_need_hard_encode(device->physical_device, pCreateInfo);
++
+ unsigned plane_count = radv_get_internal_plane_count(device->physical_device, format);
+
+- bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo);
++ bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo) && !needHardEncode;
+ bool needSoftDecode = needSoftEncode && radv_need_soft_decode(format);
+ VkFormat softDecodeFormat = radv_translate_etc(format);
+ VkFormat softEncodeFormat = radv_translate_rgb2BC(format);
+@@ -1936,6 +2010,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_
+ image->usage = pCreateInfo->usage;
+ image->flags = pCreateInfo->flags;
+ image->plane_count = vk_format_get_plane_count(format);
++ image->isNeedHardEncode = needHardEncode;
+
+ //满足要求的 ETC2 纹理使用软解
+ if (needSoftDecode) {
+diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
+index b1080ef8935..9b5a73cd1be 100644
+--- a/src/amd/vulkan/radv_meta.c
++++ b/src/amd/vulkan/radv_meta.c
+@@ -601,8 +601,14 @@ radv_device_init_meta(struct radv_device *device)
+ if (result != VK_SUCCESS)
+ goto fail_astc_decode;
+
++ result = radv_device_init_meta_rgba_encode_state(device);
++ if (result != VK_SUCCESS)
++ goto fail_bcn_encode;
++
+ return VK_SUCCESS;
+
++fail_bcn_encode:
++ 
radv_device_finish_meta_rgba_encode_state(device); + fail_astc_decode: + radv_device_finish_meta_astc_decode_state(device); + fail_etc_decode: +@@ -644,6 +650,7 @@ fail_clear: + void + radv_device_finish_meta(struct radv_device *device) + { ++ radv_device_finish_meta_rgba_encode_state(device); + radv_device_finish_meta_etc_decode_state(device); + radv_device_finish_meta_astc_decode_state(device); + radv_device_finish_accel_struct_build_state(device); +diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h +index d8ab75481d9..9702bfc9c7b 100644 +--- a/src/amd/vulkan/radv_meta.h ++++ b/src/amd/vulkan/radv_meta.h +@@ -107,6 +107,9 @@ void radv_device_finish_meta_etc_decode_state(struct radv_device *device); + VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); + void radv_device_finish_meta_astc_decode_state(struct radv_device *device); + ++VkResult radv_device_init_meta_rgba_encode_state(struct radv_device *device); ++void radv_device_finish_meta_rgba_encode_state(struct radv_device *device); ++ + void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer, + uint32_t flags); + +@@ -232,6 +235,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image + void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, + const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); + ++void radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); ++ + /** + * Return whether the bound pipeline is the FMASK decompress pass. + */ +diff --git a/src/amd/vulkan/radv_meta_astc_decode.c b/src/amd/vulkan/radv_meta_astc_decode.c +index 93a63692c09..006394c639b 100644 +--- a/src/amd/vulkan/radv_meta_astc_decode.c ++++ b/src/amd/vulkan/radv_meta_astc_decode.c +@@ -166,8 +166,15 @@ radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima + struct radv_image_view src_iview, dst_iview; + image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, + subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); +- image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, +- subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ if (image->isNeedHardEncode) { ++ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); ++ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); ++ image_view_init(device, store_image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ } else { ++ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ } + + VkExtent3D extent_copy = { + .width = extent.width, +diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c +index b418d3f2ab9..a0a6b41cce9 100644 +--- a/src/amd/vulkan/radv_meta_copy.c ++++ b/src/amd/vulkan/radv_meta_copy.c +@@ -26,6 +26,7 @@ + 
#include "main/formats.h" + #include "main/texcompress_etc.h" + #include "main/texcompress_bptc_tmp.h" ++#include "vk_texcompress_bcn.h" + + static VkExtent3D + meta_image_block_size(const struct radv_image *image) +@@ -471,6 +472,74 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf + radv_meta_restore(&saved_state, cmd_buffer); + } + ++void radv_create_staging_images(struct radv_image *dst_image, struct radv_cmd_buffer *cmd_buffer) ++{ ++ struct vk_texcompress_bcn_image *staging_images = ++ vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct vk_texcompress_bcn_image), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ dst_image->staging_images = staging_images; ++ ++ uint32_t size = 0; ++ VkExtent3D extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; ++ VkFormat format = VK_FORMAT_R8G8B8A8_UNORM; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_rgba, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_rgba, dst_image->staging_images->image_rgba); ++ size += _image_rgba->size; ++ ++ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; ++ format = VK_FORMAT_R32G32_UINT; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_bc1, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_bc1, dst_image->staging_images->image_bc1); ++ size += _image_bc1->size; ++ ++ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; ++ format = VK_FORMAT_R32G32_UINT; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_bc4, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_bc4, dst_image->staging_images->image_bc4); ++ size += _image_bc4->size; ++ ++ extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; ++ format = VK_FORMAT_R8G8B8A8_UNORM; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_dummy, extent, format, size); ++ RADV_FROM_HANDLE(radv_image, _image_dummy, dst_image->staging_images->image_dummy); ++ ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_rgba, &staging_images->image_mem, 0); ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc1, &staging_images->image_mem, _image_rgba->size); ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc4, &staging_images->image_mem, _image_bc1->size); ++ ++ cmd_buffer->vk.is_need_hard_encode = true; ++ ++ vk_texcompress_insert_head(&cmd_buffer->vk.staging_image_list_head, dst_image->staging_images); ++} ++ ++void radv_copy_buffer_hard_encode(struct radv_image *dst_image, ++ const enum util_format_layout format_layout, ++ struct radv_cmd_buffer *cmd_buffer, ++ const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) ++{ ++ radv_create_staging_images(dst_image, cmd_buffer); ++ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ 
pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } ++ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } ++} ++ + extern bool + radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + +@@ -495,6 +564,12 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); + + const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++ ++ if (dst_image->isNeedHardEncode) { ++ radv_copy_buffer_hard_encode(dst_image, format_layout, cmd_buffer, pCopyBufferToImageInfo); ++ return; ++ } ++ + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { + if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { + radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, +@@ -847,6 +922,36 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, + radv_meta_restore(&saved_state, cmd_buffer); + } + ++void radv_copy_image_hard_encode(struct radv_image *dst_image, ++ const enum util_format_layout format_layout, ++ struct radv_cmd_buffer *cmd_buffer, ++ const VkCopyImageInfo2 *pCopyImageInfo) ++{ ++ radv_create_staging_images(dst_image, cmd_buffer); ++ RADV_FROM_HANDLE(radv_image, src_image, pCopyImageInfo->srcImage); ++ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { ++ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; ++ if (src_image->vk_format != dst_image->vk_format) { ++ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * ++ vk_format_get_blockwidth(dst_image->vk_format); ++ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * ++ vk_format_get_blockheight(dst_image->vk_format); ++ } ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } ++ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } ++} ++ + VKAPI_ATTR void VKAPI_CALL + radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyImageInfo) + { +@@ -866,6 +971,12 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI + radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); + + const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++ ++ if (dst_image->isNeedHardEncode) { ++ radv_copy_image_hard_encode(dst_image, format_layout, cmd_buffer, pCopyImageInfo); ++ return; ++ } ++ + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { + VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; + if (src_image->vk_format != dst_image->vk_format) { +diff --git a/src/amd/vulkan/radv_meta_etc_decode.c 
b/src/amd/vulkan/radv_meta_etc_decode.c +index d5d002c63a0..3685218d915 100644 +--- a/src/amd/vulkan/radv_meta_etc_decode.c ++++ b/src/amd/vulkan/radv_meta_etc_decode.c +@@ -801,23 +801,45 @@ radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag + store_format = VK_FORMAT_R8G8B8A8_UNORM; + } + struct radv_image_view dest_iview; +- radv_image_view_init( +- &dest_iview, cmd_buffer->device, +- &(VkImageViewCreateInfo){ +- .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, +- .image = radv_image_to_handle(image), +- .viewType = radv_meta_get_view_type(image), +- .format = store_format, +- .subresourceRange = +- { +- .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, +- .baseMipLevel = subresource->mipLevel, +- .levelCount = 1, +- .baseArrayLayer = 0, +- .layerCount = subresource->baseArrayLayer + subresource->layerCount, +- }, +- }, +- NULL); ++ if (image->isNeedHardEncode) { ++ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); ++ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); ++ radv_image_view_init( ++ &dest_iview, cmd_buffer->device, ++ &(VkImageViewCreateInfo){ ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(store_image), ++ .viewType = radv_meta_get_view_type(store_image), ++ .format = store_format, ++ .subresourceRange = ++ { ++ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, ++ .baseMipLevel = subresource->mipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = subresource->baseArrayLayer + subresource->layerCount, ++ }, ++ }, ++ NULL); ++ } else { ++ radv_image_view_init( ++ &dest_iview, cmd_buffer->device, ++ &(VkImageViewCreateInfo){ ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(image), ++ .viewType = radv_meta_get_view_type(image), ++ .format = store_format, ++ .subresourceRange = ++ { ++ .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, ++ .baseMipLevel = subresource->mipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = subresource->baseArrayLayer + subresource->layerCount, ++ }, ++ }, ++ NULL); ++ } + + decode_etc(cmd_buffer, &src_iview, &dest_iview, &(VkOffset3D){offset.x, offset.y, base_slice}, + &(VkExtent3D){extent.width, extent.height, slice_count}); +diff --git a/src/amd/vulkan/radv_meta_rgba_encode.c b/src/amd/vulkan/radv_meta_rgba_encode.c +new file mode 100644 +index 00000000000..b144767143d +--- /dev/null ++++ b/src/amd/vulkan/radv_meta_rgba_encode.c +@@ -0,0 +1,312 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include ++#include ++ ++#include "radv_meta.h" ++#include "sid.h" ++#include "vk_common_entrypoints.h" ++#include "vk_format.h" ++#include "vk_texcompress_bcn.h" ++#include "radv_debug.h" ++ ++struct radv_dispatch_info { ++ /** ++ * Determine the layout of the grid (in block units) to be used. ++ */ ++ uint32_t blocks[3]; ++ ++ /** ++ * A starting offset for the grid. If unaligned is set, the offset ++ * must still be aligned. ++ */ ++ uint32_t offsets[3]; ++ ++ /** ++ * Whether it's an unaligned compute dispatch. ++ */ ++ bool unaligned; ++ ++ /** ++ * Whether waves must be launched in order. ++ */ ++ bool ordered; ++ ++ /** ++ * Indirect compute parameters resource. 
++ */ ++ struct radeon_winsys_bo *indirect; ++ uint64_t va; ++}; ++ ++VkResult ++radv_device_init_meta_rgba_encode_state(struct radv_device *device) ++{ ++ const struct radv_physical_device *pdev = device->physical_device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ return vk_texcompress_bcn_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), state->bcn_encoded); ++} ++ ++void ++radv_device_finish_meta_rgba_encode_state(struct radv_device *device) ++{ ++ struct radv_meta_state *state = &device->meta_state; ++ struct vk_texcompress_bcn_state **bcns = state->bcn_encoded; ++ ++ if (bcns) ++ vk_texcompress_bcn_finish(&device->vk, &state->alloc, state->bcn_encoded); ++} ++ ++extern void ++radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info); ++ ++static VkImageViewType ++get_view_type(const struct radv_image *image) ++{ ++ switch (image->type) { ++ case VK_IMAGE_TYPE_2D: ++ return VK_IMAGE_VIEW_TYPE_2D_ARRAY; ++ case VK_IMAGE_TYPE_3D: ++ return VK_IMAGE_VIEW_TYPE_3D; ++ default: ++ unreachable("bad VkImageViewType"); ++ } ++} ++ ++static void ++image_view_init(struct radv_device *device, struct radv_image *image, VkFormat format, VkImageAspectFlags aspectMask, ++ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, VkComponentMapping *component, ++ struct radv_image_view *iview) ++{ ++ VkImageViewCreateInfo iview_create_info = { ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(image), ++ .viewType = get_view_type(image), ++ .format = format, ++ .subresourceRange = ++ { ++ .aspectMask = aspectMask, ++ .baseMipLevel = baseMipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = baseArrayLayer + layerCount, ++ }, ++ }; ++ if (component) { ++ iview_create_info.components = *component; ++ } ++ ++ radv_image_view_init(iview, device, &iview_create_info, NULL); ++} ++ ++static void ++encode_bc1(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, ++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, ++ struct radv_image *src_image, struct radv_image *dst_image) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ struct radv_image_view src_iview, dst_iview; ++ VkComponentMapping component = { ++ .r = VK_COMPONENT_SWIZZLE_R, ++ .g = VK_COMPONENT_SWIZZLE_G, ++ .b = VK_COMPONENT_SWIZZLE_B, ++ .a = VK_COMPONENT_SWIZZLE_A ++ }; ++ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &src_iview); ++ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, NULL, &dst_iview); ++ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); ++ ++ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; ++ vk_texcompress_bc1_fill_write_descriptor_sets(state->bcn_encoded[BC1_ENCODE], &write_desc_set, ++ radv_image_view_to_handle(&src_iview), layout, ++ radv_image_view_to_handle(&dst_iview)); ++ ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC1_ENCODE]->p_layout, ++ 0, /* set number */ 
++ 3, write_desc_set.descriptor_set);
++
++ VkPipeline pipeline =
++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc,
++ state->bcn_encoded[BC1_ENCODE],
++ radv_pipeline_cache_to_handle(&state->cache), BC1_ENCODE);
++ if (pipeline == VK_NULL_HANDLE)
++ return;
++
++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
++
++ const unsigned num_refinements = 1;
++ const unsigned push_constants[2] = {num_refinements};
++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC1_ENCODE]->p_layout,
++ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
++
++ struct radv_dispatch_info info = {
++ .blocks[0] = DIV_ROUND_UP(extent->width, 32),
++ .blocks[1] = DIV_ROUND_UP(extent->height, 32),
++ .blocks[2] = 1,
++ };
++ radv_compute_dispatch(cmd_buffer, &info);
++ radv_image_view_finish(&src_iview);
++ radv_image_view_finish(&dst_iview);
++}
++
++static void
++encode_bc4(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout,
++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource,
++ struct radv_image *src_image, struct radv_image *dst_image)
++{
++ struct radv_device *device = cmd_buffer->device;
++ struct radv_meta_state *state = &device->meta_state;
++
++ struct radv_image_view src_iview, dst_iview;
++ VkComponentMapping component = {
++ .r = VK_COMPONENT_SWIZZLE_A,
++ .g = VK_COMPONENT_SWIZZLE_ZERO,
++ .b = VK_COMPONENT_SWIZZLE_ZERO,
++ .a = VK_COMPONENT_SWIZZLE_ONE
++ };
++ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel,
++ subresource->baseArrayLayer, 1, &component, &src_iview);
++ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel,
++ subresource->baseArrayLayer, 1, NULL, &dst_iview);
++
++ cmd_buffer->state.flush_bits |=
++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) |
++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image);
++
++ struct vk_texcompress_bc4_write_descriptor_set write_desc_set;
++ vk_texcompress_bc4_fill_write_descriptor_sets(state->bcn_encoded[BC4_ENCODE], &write_desc_set,
++ radv_image_view_to_handle(&src_iview), layout,
++ radv_image_view_to_handle(&dst_iview));
++
++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC4_ENCODE]->p_layout,
++ 0, /* set number */
++ 2, write_desc_set.descriptor_set);
++
++ VkPipeline pipeline =
++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc,
++ state->bcn_encoded[BC4_ENCODE],
++ radv_pipeline_cache_to_handle(&state->cache), BC4_ENCODE);
++ if (pipeline == VK_NULL_HANDLE)
++ return;
++
++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
++
++ const unsigned alpha_channel_id = 0; // the r channel (alpha was swizzled into r by the view above)
++ const unsigned use_norm = 0;
++ const unsigned push_constants[2] = {alpha_channel_id, use_norm};
++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC4_ENCODE]->p_layout,
++ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
++
++ struct radv_dispatch_info info = {
++ .blocks[0] = 1,
++ .blocks[1] = DIV_ROUND_UP(extent->width, 16),
++ .blocks[2] = DIV_ROUND_UP(extent->height, 16),
++ };
++ radv_compute_dispatch(cmd_buffer, &info);
++ radv_image_view_finish(&src_iview);
++ radv_image_view_finish(&dst_iview);
++}
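++
++/* Background sketch (not from the original patch; assumes the standard BC3
++ * block layout, which the staging formats used here are consistent with):
++ * every 4x4 texel block of a BC3 image is 128 bits, a 64-bit BC4-style alpha
++ * half followed by a 64-bit BC1-style color half, roughly:
++ *
++ *   struct bc3_block {       // one 4x4 texel block, 16 bytes
++ *      uint64_t alpha_bits;  // produced by encode_bc4() into image_bc4
++ *      uint64_t color_bits;  // produced by encode_bc1() into image_bc1
++ *   };
++ *
++ * so encode_bc3() below only needs to interleave the two R32G32_UINT staging
++ * images into the R32G32B32A32_UINT plane-1 view of the destination.
++ */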
radv_image_view_finish(&src_iview); ++ radv_image_view_finish(&dst_iview); ++} ++ ++static void ++encode_bc3(struct radv_cmd_buffer *cmd_buffer, ++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, ++ struct radv_image *bc1_image, struct radv_image *bc4_image, struct radv_image *dst_image) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ struct radv_image_view bc1_iview, bc4_iview, dst_iview; ++ VkComponentMapping component = { ++ .r = VK_COMPONENT_SWIZZLE_R, ++ .g = VK_COMPONENT_SWIZZLE_G, ++ .b = VK_COMPONENT_SWIZZLE_ZERO, ++ .a = VK_COMPONENT_SWIZZLE_ONE ++ }; ++ image_view_init(device, bc1_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &bc1_iview); ++ image_view_init(device, bc4_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &bc4_iview); ++ image_view_init(device, dst_image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, NULL, &dst_iview); ++ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc1_iview.image) | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc4_iview.image) | ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); ++ ++ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; ++ vk_texcompress_bc3_fill_write_descriptor_sets(state->bcn_encoded[BC3_ENCODE], &write_desc_set, ++ radv_image_view_to_handle(&bc1_iview), radv_image_view_to_handle(&bc4_iview), ++ radv_image_view_to_handle(&dst_iview)); ++ ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC3_ENCODE]->p_layout, ++ 0, /* set number */ ++ 3, write_desc_set.descriptor_set); ++ ++ VkPipeline pipeline = ++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, ++ state->bcn_encoded[BC3_ENCODE], ++ radv_pipeline_cache_to_handle(&state->cache), BC3_ENCODE); ++ if (pipeline == VK_NULL_HANDLE) ++ return; ++ ++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ ++ struct radv_dispatch_info info = { ++ .blocks[0] = DIV_ROUND_UP(extent->width, 32), ++ .blocks[1] = DIV_ROUND_UP(extent->height, 32), ++ .blocks[2] = 1, ++ }; ++ radv_compute_dispatch(cmd_buffer, &info); ++ radv_image_view_finish(&bc1_iview); ++ radv_image_view_finish(&bc4_iview); ++ radv_image_view_finish(&dst_iview); ++} ++ ++void ++radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_saved_state saved_state; ++ radv_meta_save(&saved_state, cmd_buffer, ++ RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); ++ ++ VkImage _image = radv_image_to_handle(image); ++ VK_FROM_HANDLE(vk_image, vkImage, _image); ++ ++ extent = radv_sanitize_image_extent(image->type, extent); ++ offset = radv_sanitize_image_offset(image->type, offset); ++ ++ VkExtent3D extent_copy = { ++ .width = extent.width, ++ .height = extent.height, ++ .depth = 1, ++ }; ++ ++ RADV_FROM_HANDLE(radv_image, rgba_image, 
image->staging_images->image_rgba);
++   RADV_FROM_HANDLE(radv_image, image1, image->staging_images->image_bc1);
++   RADV_FROM_HANDLE(radv_image, image4, image->staging_images->image_bc4);
++
++   encode_bc1(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image1);
++
++   encode_bc4(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image4);
++
++   encode_bc3(cmd_buffer, &extent_copy, subresource, image1, image4, image);
++
++   radv_meta_restore(&saved_state, cmd_buffer);
++}
+diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
+index 472858dd401..04a670d714d 100644
+--- a/src/amd/vulkan/radv_private.h
++++ b/src/amd/vulkan/radv_private.h
+@@ -98,6 +98,7 @@ typedef uint32_t xcb_window_t;
+ 
+ #include "vk_texcompress_astc.h"
+ #include "wsi_common.h"
++#include "vk_texcompress_bcn.h"
+ 
+ #ifdef __cplusplus
+ extern "C"
+@@ -681,6 +682,8 @@ struct radv_meta_state {
+    } etc_decode;
+ 
+    struct vk_texcompress_astc_state *astc_decode;
++
++   struct vk_texcompress_bcn_state *bcn_encoded[VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES];
+ };
+ 
+ #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
+@@ -2090,8 +2093,12 @@ struct radv_image {
+    bool dcc_sign_reinterpret;
+    bool support_comp_to_single;
+ 
+-   bool isNeedSoftEncode;// 是否需要使用 软编
+-   bool isNeedSoftDecode;// 是否需要使用 软解
++   bool isNeedSoftEncode;  // whether software (CPU) encoding is needed
++   bool isNeedSoftDecode;  // whether software (CPU) decoding is needed
++
++   bool isNeedHardEncode;  // whether hardware (compute-shader) encoding is needed
++   struct vk_texcompress_bcn_image *staging_images;  // intermediate resources for hardware encoding
++
+    VkFormat srcFormat; //用于保存纹理的原始格式
+    VkFormat unpackETCFormat;//用于保存ETC纹理解压后的格式
+ 
+@@ -2474,6 +2481,8 @@ void radv_image_view_init(struct radv_image_view *view, struct radv_device *devi
+                           const struct radv_image_view_extra_create_info *extra_create_info);
+ void radv_image_view_finish(struct radv_image_view *iview);
+ 
++void radv_internal_image_finish(struct radv_image *image);
++
+ VkFormat radv_get_aspect_format(struct radv_image *image, VkImageAspectFlags mask);
+ 
+ struct radv_sampler_ycbcr_conversion_state {
+diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl
+index 251635d7b69..627b2e5419c 100644
+--- a/src/compiler/glsl/bc1.glsl
++++ b/src/compiler/glsl/bc1.glsl
+@@ -1,3 +1,4 @@
++#version 310 es
+ /*
+  * Copyright 2020-2022 Matias N. Goldberg
+  * Copyright 2022 Intel Corporation
+@@ -21,7 +22,6 @@
+  * DEALINGS IN THE SOFTWARE.
+ */ + +-#version 310 es + + #if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or +@@ -31,10 +31,31 @@ + #define const + #endif + +-%s // include "CrossPlatformSettings_piece_all.glsl" +- + #define FLT_MAX 340282346638528859811704183484516925440.0f + ++#ifdef VULKAN ++ ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" ++ ++layout(push_constant, std430) uniform pc { ++ uint p_numRefinements; ++}; ++ ++layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ ++layout (std430, set = 0, binding = 1) readonly restrict buffer globalBuffer ++{ ++ float2 c_oMatch5[256]; ++ float2 c_oMatch6[256]; ++}; ++ ++layout (rgba16ui, set = 0, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ + layout( location = 0 ) uniform uint p_numRefinements; + + uniform sampler2D srcTex; +@@ -47,6 +68,8 @@ layout( std430, binding = 1 ) readonly restrict buffer globalBuffer + float2 c_oMatch6[256]; + }; + ++#endif ++ + layout( local_size_x = 8, // + local_size_y = 8, // + local_size_z = 1 ) in; +diff --git a/src/compiler/glsl/bc4.glsl b/src/compiler/glsl/bc4.glsl +index 2486fa4ae0c..0b9d236d624 100644 +--- a/src/compiler/glsl/bc4.glsl ++++ b/src/compiler/glsl/bc4.glsl +@@ -1,3 +1,5 @@ ++#version 310 es ++ + /* + * Copyright 2020-2022 Matias N. Goldberg + * Copyright 2022 Intel Corporation +@@ -21,8 +23,6 @@ + * DEALINGS IN THE SOFTWARE. + */ + +-#version 310 es +- + #if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or + // run-time constants. GLSL ES only allows the keyword for compile-time +@@ -33,20 +33,37 @@ + + #define __sharedOnlyBarrier memoryBarrierShared();barrier(); + +-%s // include "CrossPlatformSettings_piece_all.glsl" ++#ifdef VULKAN + +-shared float2 g_minMaxValues[4u * 4u * 4u]; +-shared uint2 g_mask[4u * 4u]; ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" + +-layout( location = 0 ) uniform uint2 params; ++layout(push_constant, std430) uniform pc { ++ uint2 params; ++}; + +-#define p_channelIdx params.x +-#define p_useSNorm params.y ++layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ ++layout (rgba16ui, set = 0, binding = 1) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" + + uniform sampler2D srcTex; + + layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; + ++layout( location = 0 ) uniform uint2 params; ++ ++#endif ++ ++shared float2 g_minMaxValues[4u * 4u * 4u]; ++shared uint2 g_mask[4u * 4u]; ++ ++#define p_channelIdx params.x ++#define p_useSNorm params.y ++ + layout( local_size_x = 4, // + local_size_y = 4, // + local_size_z = 4 ) in; +diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl +index f90accb82e1..8d50c469956 100644 +--- a/src/compiler/glsl/etc2_rgba_stitch.glsl ++++ b/src/compiler/glsl/etc2_rgba_stitch.glsl +@@ -1,3 +1,5 @@ ++#version 310 es ++ + /* + * Copyright 2020-2022 Matias N. 
Goldberg + * Copyright 2022 Intel Corporation +@@ -25,18 +27,30 @@ + // This compute shader merely stitches them together to form the final result + // It's also used by RG11 driver to stitch two R11 into one RG11 + +-#version 310 es ++#ifdef VULKAN + +-%s // include "CrossPlatformSettings_piece_all.glsl" ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" + +-layout( local_size_x = 8, // +- local_size_y = 8, // +- local_size_z = 1 ) in; ++layout ( binding = 0 ) uniform highp usampler2D srcRGB; ++layout ( binding = 1 ) uniform highp usampler2D srcAlpha; ++ ++layout ( rgba32ui, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" + + layout( binding = 0 ) uniform highp usampler2D srcRGB; + layout( binding = 1 ) uniform highp usampler2D srcAlpha; + layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; + ++#endif ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ + void main() + { + uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; +diff --git a/src/compiler/meson.build b/src/compiler/meson.build +index e18af3bf335..85b19b84bec 100644 +--- a/src/compiler/meson.build ++++ b/src/compiler/meson.build +@@ -24,6 +24,14 @@ inc_spirv = include_directories('spirv') + + astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') + ++bc3_encode_head_dir = meson.current_source_dir() / 'glsl' ++ ++bc1_glsl_file = files('glsl/bc1.glsl') ++ ++bc4_glsl_file = files('glsl/bc4.glsl') ++ ++etc2_rgba_stitch_glsl_file = files('glsl/etc2_rgba_stitch.glsl') ++ + files_libcompiler = files( + 'builtin_type_macros.h', + 'glsl_types.cpp', +diff --git a/src/util/bc1_tables.h b/src/util/bc1_tables.h +new file mode 100644 +index 00000000000..654ccdadb78 +--- /dev/null ++++ b/src/util/bc1_tables.h +@@ -0,0 +1,124 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */ ++ ++/* These tables have been generated with: ++ ++ #define STB_DXT_IMPLEMENTATION ++ #include "stb_dxt.h" ++ #include ++ ++ int main() ++ { ++ stb__InitDXT(); ++ ++ printf( "static unsigned char stb__OMatch5[256][2] = {\n" ); ++ for( size_t i = 0u; i < 256u; ++i ) ++ { ++ printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] ); ++ if( i != 255u ) ++ printf( ",\n" ); ++ } ++ printf( "\n};\n" ); ++ ++ printf( "static unsigned char stb__OMatch6[256][2] = {\n" ); ++ for( size_t i = 0u; i < 256u; ++i ) ++ { ++ printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] ); ++ if( i != 255u ) ++ printf( ",\n" ); ++ } ++ printf( "\n};\n" ); ++ ++ return 0; ++ } ++ ++ Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min ++*/ ++static unsigned char stb__OMatch5[256][2] = { ++ { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, ++ { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 }, ++ { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 }, ++ { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 }, ++ { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 }, ++ { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 }, ++ { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 }, ++ { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 }, ++ { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 }, ++ { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 }, ++ { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 }, ++ { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 }, ++ { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 }, ++ { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 }, ++ { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 }, ++ { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 }, ++ { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 }, ++ { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 }, ++ { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 }, ++ { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 }, ++ { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 }, ++ { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 }, ++ { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 }, ++ { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 }, ++ { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 }, ++ { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 }, ++ { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 }, ++ { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 }, ++ { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 }, ++ { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 
26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 }, ++ { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 }, ++ { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 } ++}; ++ ++static unsigned char stb__OMatch6[256][2] = { ++ { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 }, ++ { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 }, ++ { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 }, ++ { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 }, ++ { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 }, ++ { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 }, ++ { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 }, ++ { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 }, ++ { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 }, ++ { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 }, ++ { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 }, ++ { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 }, ++ { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 }, ++ { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 }, ++ { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 }, ++ { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 }, ++ { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 }, ++ { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 }, ++ { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 }, ++ { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 }, ++ { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 }, ++ { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 }, ++ { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 }, ++ { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 }, ++ { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 }, ++ { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 }, ++ { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 }, ++ { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 }, ++ { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 }, ++ { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 }, ++ { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 }, ++ { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 } ++}; +diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build +index 1c02e1a5dd8..54c89d793a0 100644 +--- a/src/vulkan/runtime/meson.build ++++ 
b/src/vulkan/runtime/meson.build
+@@ -93,12 +93,31 @@ endif
+ 
+ if prog_glslang.found()
+   vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h')
++  vulkan_runtime_files += files('vk_texcompress_bcn.c', 'vk_texcompress_bcn.h')
+   vulkan_runtime_files += custom_target(
+     'astc_spv.h',
+     input : astc_decoder_glsl_file,
+     output : 'astc_spv.h',
+     command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
+   )
++  vulkan_runtime_files += custom_target(
++    'bc1_spv.h',
++    input : bc1_glsl_file,
++    output : 'bc1_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
++  vulkan_runtime_files += custom_target(
++    'bc4_spv.h',
++    input : bc4_glsl_file,
++    output : 'bc4_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
++  vulkan_runtime_files += custom_target(
++    'bc3_spv.h',
++    input : etc2_rgba_stitch_glsl_file,
++    output : 'bc3_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
+ endif
+ 
+ vk_common_entrypoints = custom_target(
+diff --git a/src/vulkan/runtime/vk_command_buffer.h b/src/vulkan/runtime/vk_command_buffer.h
+index 36099d03a56..1a04b53510f 100644
+--- a/src/vulkan/runtime/vk_command_buffer.h
++++ b/src/vulkan/runtime/vk_command_buffer.h
+@@ -28,6 +28,7 @@
+ #include "vk_object.h"
+ #include "util/list.h"
+ #include "util/u_dynarray.h"
++#include "vk_texcompress_bcn.h"
+ 
+ #ifdef __cplusplus
+ extern "C" {
+@@ -130,6 +131,12 @@ struct vk_command_buffer {
+    /* This uses the same trick as STACK_ARRAY */
+    struct vk_attachment_state *attachments;
+    struct vk_attachment_state _attachments[8];
++
++   bool is_need_hard_encode;
++
++   struct vk_texcompress_staging_images* staging_image_list_head;
++
++   struct vk_texcompress_dummy_image* dummy;
+ };
+ 
+ VK_DEFINE_HANDLE_CASTS(vk_command_buffer, base, VkCommandBuffer,
+diff --git a/src/vulkan/runtime/vk_queue.c b/src/vulkan/runtime/vk_queue.c
+index 6216af9d856..15994e3420d 100644
+--- a/src/vulkan/runtime/vk_queue.c
++++ b/src/vulkan/runtime/vk_queue.c
+@@ -42,6 +42,7 @@
+ #include "vk_util.h"
+ 
+ #include "vulkan/wsi/wsi_common.h"
++#include "vk_texcompress_bcn.h"
+ 
+ static VkResult
+ vk_queue_start_submit_thread(struct vk_queue *queue);
+@@ -1140,7 +1141,22 @@ vk_common_QueueSubmit2KHR(VkQueue _queue,
+       }
+    }
+ 
++   int n_command_buffers = 0;
++   for (uint32_t s = 0; s < submitCount; s++) {
++      n_command_buffers += pSubmits[s].commandBufferInfoCount;
++   }
++   const uint32_t const_num_cmd_buf = n_command_buffers;
++   uint32_t num_cmd_buf = 0;
++   struct vk_command_buffer *need_delete_cmd_buf[const_num_cmd_buf];
++
+    for (uint32_t i = 0; i < submitCount; i++) {
++      for (uint32_t j = 0; j < pSubmits[i].commandBufferInfoCount; j++) {
++         VK_FROM_HANDLE(vk_command_buffer, commandBuffer, (pSubmits[i].pCommandBufferInfos)[j].commandBuffer);
++         if (commandBuffer->is_need_hard_encode) {
++            need_delete_cmd_buf[num_cmd_buf] = commandBuffer;
++            num_cmd_buf += 1;
++         }
++      }
+       struct vulkan_submit_info info = {
+          .pNext = pSubmits[i].pNext,
+          .command_buffer_count = pSubmits[i].commandBufferInfoCount,
+@@ -1156,6 +1172,20 @@ vk_common_QueueSubmit2KHR(VkQueue _queue,
+          return result;
+       }
+ 
++   if (num_cmd_buf > 0 && fence == NULL) {
++      vk_texcompress_create_fence(queue->base.device, NULL, &_fence);
++   }
++
++   if (num_cmd_buf > 0) {
++
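++      /* If the caller supplied no fence, the dummy fence created above starts
++       * out signaled and this wait returns immediately; otherwise it blocks
++       * until the submission finishes, after which the staging images that
++       * backed the GPU BCn encode can be destroyed. */
++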
vk_texcompress_wait_fence(queue->base.device, NULL, &_fence); ++ for (uint32_t i = 0; i < num_cmd_buf; i++) { ++ while (need_delete_cmd_buf[i]->staging_image_list_head != NULL) { ++ vk_texcompress_delete_head(&need_delete_cmd_buf[i]->staging_image_list_head, queue->base.device); ++ } ++ need_delete_cmd_buf[i]->is_need_hard_encode = false; ++ } ++ } ++ + return VK_SUCCESS; + } + +diff --git a/src/vulkan/runtime/vk_texcompress_bcn.c b/src/vulkan/runtime/vk_texcompress_bcn.c +new file mode 100644 +index 00000000000..2c85d98af5c +--- /dev/null ++++ b/src/vulkan/runtime/vk_texcompress_bcn.c +@@ -0,0 +1,740 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "vk_texcompress_bcn.h" ++#include "util/bc1_tables.h" ++#include "vk_alloc.h" ++#include "vk_format.h" ++#include "vk_image.h" ++#include "vk_physical_device.h" ++#include "log/log.h" ++ ++void ++vk_texcompress_insert_head(staging_images **head, bcn_image *current) ++{ ++ staging_images *node = (staging_images *)malloc(sizeof(staging_images)); ++ if (node == NULL) { ++ ALOGE("Failed to alloc staging_images node"); ++ return; ++ } ++ ++ node->current = current; ++ node->next = *head; ++ *head = node; ++} ++ ++void ++vk_texcompress_delete_head(staging_images **head, struct vk_device *device) ++{ ++ if (*head == NULL) { ++ return; ++ } ++ staging_images *current_head = *head; ++ vk_texcompress_finish_image(device, NULL, current_head->current); ++ *head = current_head->next; ++ free(current_head); ++} ++ ++/* type_indexes_mask bits are set/clear for support memory type index as per ++ * struct VkPhysicalDeviceMemoryProperties.memoryTypes[] */ ++static uint32_t ++get_mem_type_index(struct vk_device *device, uint32_t type_indexes_mask, ++ VkMemoryPropertyFlags mem_property) ++{ ++ const struct vk_physical_device_dispatch_table *disp = &device->physical->dispatch_table; ++ VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical); ++ ++ VkPhysicalDeviceMemoryProperties2 props2 = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2, ++ .pNext = NULL, ++ }; ++ disp->GetPhysicalDeviceMemoryProperties2(_phy_device, &props2); ++ ++ for (uint32_t i = 0; i < props2.memoryProperties.memoryTypeCount; i++) { ++ if ((type_indexes_mask & (1 << i)) && ++ ((props2.memoryProperties.memoryTypes[i].propertyFlags & mem_property) == mem_property)) { ++ return i; ++ } ++ } ++ ++ return -1; ++} ++ ++static VkResult ++create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, struct vk_texcompress_bcn_state *bc1) ++{ ++ VkDeviceSize size = sizeof(float) * (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)); ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkBufferCreateInfo buffer_create_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, ++ .size = size, ++ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ++ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, ++ }; ++ result = ++ disp->CreateBuffer(_device, &buffer_create_info, allocator, &bc1->bc1_table_buf); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ VkBufferMemoryRequirementsInfo2 mem_req_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, ++ .buffer = bc1->bc1_table_buf, ++ }; ++ VkMemoryRequirements2 mem_req = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, ++ }; ++ disp->GetBufferMemoryRequirements2(_device, &mem_req_info, &mem_req); ++ ++ uint32_t mem_type_index = get_mem_type_index( ++ device, mem_req.memoryRequirements.memoryTypeBits, ++ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); ++ if (mem_type_index == -1) ++ return VK_ERROR_OUT_OF_DEVICE_MEMORY; ++ ++ VkMemoryAllocateInfo alloc_info = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, ++ .allocationSize = mem_req.memoryRequirements.size, ++ .memoryTypeIndex = mem_type_index, ++ }; ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, &bc1->bc1_table_mem); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ result = disp->BindBufferMemory(_device, bc1->bc1_table_buf, bc1->bc1_table_mem, 0); ++ if (unlikely(result != VK_SUCCESS)) ++ return 
result; ++ ++ void *data; ++ disp->MapMemory(_device, bc1->bc1_table_mem, 0, size, 0, &data); ++ float (*data_array)[2] = (float(*)[2])data; ++ for (int i = 0; i < 256; i++) { ++ for (int j = 0; j < 2; j++) { ++ data_array[i][j] = (float)stb__OMatch5[i][j]; ++ data_array[i + 256][j] = (float)stb__OMatch6[i][j]; ++ } ++ } ++ disp->UnmapMemory(_device, bc1->bc1_table_mem); ++ ++ return result; ++} ++ ++VkResult ++vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, ++ VkImage *image, VkExtent3D extent, VkFormat format, int size) ++{ ++ bool is_dummy = (size != 0); ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkImageCreateInfo image_create_info = { ++ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, ++ .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, ++ .imageType = VK_IMAGE_TYPE_2D, ++ .format = format, ++ .extent = extent, ++ .mipLevels = 1, ++ .arrayLayers = 1, ++ .samples = VK_SAMPLE_COUNT_1_BIT, ++ .tiling = VK_IMAGE_TILING_OPTIMAL, ++ .usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT, ++ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, ++ .initialLayout = VK_IMAGE_LAYOUT_GENERAL, ++ }; ++ result = disp->CreateImage(_device, &image_create_info, allocator, image); ++ if (unlikely(result != VK_SUCCESS)) { ++ return result; ++ } ++ ++ if (is_dummy) { ++ VkMemoryRequirements mem_requirements; ++ disp->GetImageMemoryRequirements(_device, *image, &mem_requirements); ++ ++ uint32_t mem_type_index = get_mem_type_index( ++ device, mem_requirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); ++ if (mem_type_index == -1) { ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ ++ VkMemoryAllocateInfo alloc_info = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, ++ .allocationSize = size, ++ .memoryTypeIndex = mem_type_index, ++ }; ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, pMem); ++ if (unlikely(result != VK_SUCCESS)) { ++ return result; ++ } ++ } ++ if (is_dummy) { ++ disp->DestroyImage(_device, *image, allocator); ++ } ++ return result; ++} ++ ++void ++vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_image *intermidate_image) ++{ ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_rgba, NULL); ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc1, NULL); ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc4, NULL); ++ disp->FreeMemory(vk_device_to_handle(device), intermidate_image->image_mem, NULL); ++ vk_free(&device->alloc, intermidate_image); ++} ++ ++void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->BindImageMemory(_device, *image, *pMem, offset); ++} ++ ++static VkResult ++create_sampler(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkSampler *sampler, VkFilter filter, VkSamplerMipmapMode mipmapMode) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkSamplerCreateInfo create_info = { ++ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, ++ .magFilter = filter, ++ 
.minFilter = filter, ++ .addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .anisotropyEnable = VK_FALSE, ++ .borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK, ++ .unnormalizedCoordinates = VK_FALSE, ++ .compareEnable = VK_FALSE, ++ .mipmapMode = mipmapMode, ++ }; ++ VkResult result = disp->CreateSampler(_device, &create_info, allocator, sampler); ++ return result; ++} ++ ++static VkResult ++create_bc1_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 2, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_bc4_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_bc3_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = 
VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 2, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_pipeline_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, enum compute_pipeline_id id) ++{ ++ VkResult result = VK_RESULT_MAX_ENUM; ++ switch (id) { ++ case BC1_ENCODE: ++ result = create_bc1_layout(device, allocator, bcn); ++ break; ++ case BC4_ENCODE: ++ result = create_bc4_layout(device, allocator, bcn); ++ break; ++ case BC3_ENCODE: ++ result = create_bc3_layout(device, allocator, bcn); ++ break; ++ default: ++ goto fail; ++ } ++ if (result != VK_SUCCESS) { ++ goto fail; ++ } ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipelineLayoutCreateInfo pl_create_info = { ++ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, ++ .setLayoutCount = 1, ++ .pSetLayouts = &bcn->ds_layout, ++ .pushConstantRangeCount = 1, ++ .pPushConstantRanges = &(VkPushConstantRange) {VK_SHADER_STAGE_COMPUTE_BIT, 0, 8} ++ }; ++ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, &bcn->p_layout); ++fail: ++ return result; ++} ++ ++static const uint32_t bc1_spv[] = { ++#include "bc1_spv.h" ++}; ++ ++static const uint32_t bc4_spv[] = { ++#include "bc4_spv.h" ++}; ++ ++static const uint32_t bc3_spv[] = { ++#include "bc3_spv.h" ++}; ++ ++static VkResult ++vk_bc1_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc1_spv), ++ .pCode = bc1_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++vk_bc4_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc4_spv), ++ .pCode = bc4_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++vk_bc3_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = 
VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc3_spv), ++ .pCode = bc3_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++create_bcn_encode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, ++ VkPipelineCache pipeline_cache) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipeline pipeline; ++ ++ /* compute shader */ ++ VkPipelineShaderStageCreateInfo pipeline_shader_stage = { ++ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, ++ .stage = VK_SHADER_STAGE_COMPUTE_BIT, ++ .module = bcn->shader_module, ++ .pName = "main", ++ }; ++ ++ VkComputePipelineCreateInfo vk_pipeline_info = { ++ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, ++ .stage = pipeline_shader_stage, ++ .flags = 0, ++ .layout = bcn->p_layout, ++ }; ++ ++ result = disp->CreateComputePipelines(_device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); ++ if (result != VK_SUCCESS) ++ return result; ++ ++ bcn->pipeline = pipeline; ++ ++ return result; ++} ++ ++VkPipeline ++vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, VkPipelineCache pipeline_cache, ++ enum compute_pipeline_id id) ++{ ++ VkResult result; ++ ++ simple_mtx_lock(&bcn->mutex); ++ ++ if (bcn->pipeline) ++ goto unlock; ++ ++ if (id >= MAX_PIPELINE_NUM || id < 0) ++ goto unlock; ++ ++ if (id == BC1_ENCODE && !bcn->shader_module) { ++ result = vk_bc1_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ if (id == BC4_ENCODE && !bcn->shader_module) { ++ result = vk_bc4_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ if (id == BC3_ENCODE && !bcn->shader_module) { ++ result = vk_bc3_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ create_bcn_encode_pipeline(device, allocator, bcn, pipeline_cache); ++ ++unlock: ++ simple_mtx_unlock(&bcn->mutex); ++ return bcn->pipeline; ++} ++ ++static inline void ++fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkSampler sampler, VkImageView img_view, ++ VkImageLayout img_layout) ++{ ++ info->sampler = sampler; ++ info->imageView = img_view; ++ info->imageLayout = img_layout; ++} ++ ++static inline void ++fill_write_descriptor_set_image(VkWriteDescriptorSet *set, uint8_t bind_i, ++ VkDescriptorType desc_type, VkDescriptorImageInfo *image_info) ++{ ++ set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; ++ set->pNext = NULL; ++ set->dstSet = VK_NULL_HANDLE; ++ set->dstBinding = bind_i; ++ set->dstArrayElement = 0; ++ set->descriptorCount = 1; ++ set->descriptorType = desc_type; ++ set->pImageInfo = image_info; ++ set->pBufferInfo = NULL; ++ set->pTexelBufferView = NULL; ++} ++ ++static inline void ++fill_write_descriptor_set_buffer(VkWriteDescriptorSet *set, ++ uint8_t bind_i, VkDescriptorType desc_type, ++ VkDescriptorBufferInfo *buf_info) ++{ ++ set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; ++ set->pNext = NULL; ++ set->dstSet = VK_NULL_HANDLE; ++ set->dstBinding = bind_i; ++ set->dstArrayElement = 0; ++ set->descriptorCount = 1; ++ set->descriptorType = desc_type; ++ set->pImageInfo = NULL; ++ set->pBufferInfo = buf_info; ++ 
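++   /* pTexelBufferView is only read for texel-buffer descriptor types;
++    * it is cleared here so the struct is fully initialized. */
++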
set->pTexelBufferView = NULL; ++} ++ ++void ++vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bcn_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ set->bc1_buffer_info.buffer = bcn->bc1_table_buf; ++ set->bc1_buffer_info.offset = 0; ++ set->bc1_buffer_info.range = VK_WHOLE_SIZE; ++ fill_write_descriptor_set_buffer(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &set->bc1_buffer_info); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++void ++vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bc4_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++void ++vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bcn_write_descriptor_set *set, ++ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, bc1_src_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ fill_desc_image_info_struct(&set->src_desc_image_info_2, bcn->sampler, bc4_src_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info_2); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++VkResult ++vk_texcompress_bcn_init(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_bcn_state *bcn[]) ++{ ++ VkResult result; ++ ++ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { ++ /* memory to be freed as part of vk_bcn_decode_finish() */ ++ bcn[pi] = vk_zalloc(allocator, sizeof(struct vk_texcompress_bcn_state), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (bcn[pi] == NULL) { ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ simple_mtx_init(&(bcn[pi])->mutex, mtx_plain); ++ } ++ ++ result = create_buffer(device, allocator, bcn[BC1_ENCODE]); ++ if (result != VK_SUCCESS) ++ goto fail; ++ ++ result = create_sampler(device, allocator, &bcn[BC1_ENCODE]->sampler, ++ 
VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC1_ENCODE], BC1_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_sampler(device, allocator, &bcn[BC4_ENCODE]->sampler,
++                           VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC4_ENCODE], BC4_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   // VK_FORMAT_R32G32_UINT image views do not expose VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, so use nearest filtering
++   result = create_sampler(device, allocator, &bcn[BC3_ENCODE]->sampler,
++                           VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC3_ENCODE], BC3_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++fail:
++   return result;
++}
++
++void
++vk_texcompress_bcn_finish(struct vk_device *device,
++                          VkAllocationCallbacks *allocator,
++                          struct vk_texcompress_bcn_state *bcn[])
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) {
++      if (bcn[pi]->pipeline)
++         disp->DestroyPipeline(_device, bcn[pi]->pipeline, allocator);
++
++      disp->DestroyPipelineLayout(_device, bcn[pi]->p_layout, allocator);
++      disp->DestroyShaderModule(_device, bcn[pi]->shader_module, allocator);
++      disp->DestroyDescriptorSetLayout(_device, bcn[pi]->ds_layout, allocator);
++
++      if (pi == BC1_ENCODE) {
++         disp->DestroyBuffer(_device, bcn[pi]->bc1_table_buf, allocator);
++         disp->FreeMemory(_device, bcn[pi]->bc1_table_mem, allocator);
++      }
++      vk_free(allocator, bcn[pi]);
++   }
++}
++
++void
++vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence)
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   VkFenceCreateInfo create_info = {
++      .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
++      .flags = VK_FENCE_CREATE_SIGNALED_BIT,
++   };
++   VkResult result = disp->CreateFence(_device, &create_info, allocator, fence);
++   if (result != VK_SUCCESS) {
++      ALOGE("Failed to create fence");
++   }
++}
++
++void
++vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence)
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   VkResult result = disp->WaitForFences(_device, 1, fence, VK_TRUE, UINT64_MAX);
++   if (result != VK_SUCCESS) {
++      ALOGE("Failed to wait for fence");
++   }
++}
+\ No newline at end of file
+diff --git a/src/vulkan/runtime/vk_texcompress_bcn.h b/src/vulkan/runtime/vk_texcompress_bcn.h
+new file mode 100644
+index 00000000000..ca8189b657e
+--- /dev/null
++++ b/src/vulkan/runtime/vk_texcompress_bcn.h
+@@ -0,0 +1,142 @@
++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining
++ * a copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sublicense, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ *
the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be
++ * included in all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++#ifndef VK_TEXCOMPRESS_BCN_H
++#define VK_TEXCOMPRESS_BCN_H
++
++#include "util/simple_mtx.h"
++#include "vk_fence.h"
++#include "vk_device.h"
++
++#define VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES 3
++#define VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT 3 // at most 3 descriptor writes per shader
++
++struct vk_texcompress_bcn_write_descriptor_set {
++   VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT];
++   VkDescriptorImageInfo dst_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info_2;
++   VkDescriptorBufferInfo bc1_buffer_info;
++};
++
++struct vk_texcompress_bc4_write_descriptor_set {
++   VkWriteDescriptorSet descriptor_set[2];
++   VkDescriptorImageInfo dst_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info;
++};
++
++typedef struct vk_texcompress_bcn_image {
++   VkImage image_dummy;
++   VkImage image_rgba;
++   VkImage image_bc1;
++   VkImage image_bc4;
++   VkDeviceMemory image_mem;
++} bcn_image;
++
++struct vk_texcompress_dummy_image {
++   VkImage image_dummy;
++   VkDeviceMemory image_mem;
++};
++
++typedef struct vk_texcompress_staging_images {
++   struct vk_texcompress_bcn_image *current;
++   struct vk_texcompress_staging_images* next;
++} staging_images;
++
++enum compute_pipeline_id {
++   BC1_ENCODE = 0,
++   BC4_ENCODE,
++   BC3_ENCODE,
++   MAX_PIPELINE_NUM
++};
++
++struct vk_texcompress_bcn_state {
++   simple_mtx_t mutex;
++   VkDescriptorSetLayout ds_layout;
++   VkPipelineLayout p_layout;
++   VkPipeline pipeline;
++   VkShaderModule shader_module;
++   VkSampler sampler;
++
++   VkDeviceMemory bc1_table_mem;
++   VkBuffer bc1_table_buf;
++};
++
++void
++vk_texcompress_insert_head(staging_images **head, bcn_image *current);
++
++void
++vk_texcompress_delete_head(staging_images **head, struct vk_device *device);
++
++VkResult
++vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem,
++                            VkImage *image, VkExtent3D extent, VkFormat format, int size);
++
++void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset);
++
++void
++vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator,
++                            struct vk_texcompress_bcn_image *intermidate_image);
++
++void
++vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bcn_write_descriptor_set *set,
++                                              VkImageView src_img_view, VkImageLayout src_img_layout,
++                                              VkImageView dst_img_view);
++
++void
++vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bc4_write_descriptor_set *set,
++                                              VkImageView src_img_view, VkImageLayout src_img_layout,
++                                              VkImageView dst_img_view);
++
++void
++vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bcn_write_descriptor_set *set,
++
VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, ++ VkImageView dst_img_view); ++ ++VkPipeline ++vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, ++ VkPipelineCache pipeline_cache, ++ enum compute_pipeline_id id); ++ ++VkResult ++vk_texcompress_bcn_init(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_bcn_state *bcn[]); ++ ++void ++vk_texcompress_bcn_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn[]); ++ ++ ++void ++vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); ++ ++void ++vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); ++ ++#endif /* VK_TEXCOMPRESS_BCN_H */ -- Gitee From 646e99d044d2e307bd583369a8d3c462b3e16635 Mon Sep 17 00:00:00 2001 From: zhuanghb3 Date: Thu, 25 Sep 2025 02:18:55 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E4=BD=BF=E8=83=BDvulkan=20astc=E5=92=8Cetc?= =?UTF-8?q?=E7=BA=B9=E7=90=86=E7=A1=AC=E8=A7=A3=E7=A1=AC=E7=BC=96=E4=B8=BA?= =?UTF-8?q?bc3=E7=BA=B9=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- patchForAndroid/external-mesa-0003.patch | 1579 ++++++++++++++++++++++ 1 file changed, 1579 insertions(+) create mode 100644 patchForAndroid/external-mesa-0003.patch diff --git a/patchForAndroid/external-mesa-0003.patch b/patchForAndroid/external-mesa-0003.patch new file mode 100644 index 0000000..1b0df66 --- /dev/null +++ b/patchForAndroid/external-mesa-0003.patch @@ -0,0 +1,1579 @@ +diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c +index 7172e41ab75..522da8c8699 100644 +--- a/src/mesa/main/shaderapi.c ++++ b/src/mesa/main/shaderapi.c +@@ -2674,12 +2674,11 @@ _mesa_copy_linked_program_data(const struct gl_shader_program *src, + /** + * ARB_separate_shader_objects: Compile & Link Program + */ +-GLuint GLAPIENTRY +-_mesa_CreateShaderProgramv(GLenum type, GLsizei count, +- const GLchar* const *strings) ++GLuint ++_mesa_CreateShaderProgramv_impl(struct gl_context *ctx, ++ GLenum type, GLsizei count, ++ const GLchar* const *strings) + { +- GET_CURRENT_CONTEXT(ctx); +- + const GLuint shader = create_shader_err(ctx, type, "glCreateShaderProgramv"); + GLuint program = 0; + +@@ -2731,6 +2730,17 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count, + return program; + } + ++/** ++ * ARB_separate_shader_objects: Compile & Link Program ++ */ ++GLuint GLAPIENTRY ++_mesa_CreateShaderProgramv(GLenum type, GLsizei count, ++ const GLchar* const *strings) ++{ ++ GET_CURRENT_CONTEXT(ctx); ++ ++ return _mesa_CreateShaderProgramv_impl(ctx, type, count, strings); ++} + + static void + set_patch_vertices(struct gl_context *ctx, GLint value) +diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h +index 90e90fee743..1367dd8b5ee 100644 +--- a/src/mesa/main/shaderapi.h ++++ b/src/mesa/main/shaderapi.h +@@ -200,6 +200,10 @@ const char * + _mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check); + ++GLuint ++_mesa_CreateShaderProgramv_impl(struct gl_context *ctx, ++ GLenum type, GLsizei count, ++ const GLchar* const *strings); + #ifdef __cplusplus + } + #endif +diff --git a/src/mesa/meson.build b/src/mesa/meson.build +index bc7963413ff..06cba8d8353 100644 +--- a/src/mesa/meson.build ++++ b/src/mesa/meson.build +@@ -320,6 +320,7 @@ files_libmesa 
= files( + 'state_tracker/st_atom_tess.c', + 'state_tracker/st_atom_texture.c', + 'state_tracker/st_atom_viewport.c', ++ 'state_tracker/st_bc1_tables.h', + 'state_tracker/st_cb_bitmap.c', + 'state_tracker/st_cb_bitmap.h', + 'state_tracker/st_cb_bitmap_shader.c', +@@ -388,6 +389,8 @@ files_libmesa = files( + 'state_tracker/st_scissor.h', + 'state_tracker/st_shader_cache.c', + 'state_tracker/st_shader_cache.h', ++ 'state_tracker/st_texcompress_compute.c', ++ 'state_tracker/st_texcompress_compute.h', + 'state_tracker/st_texture.c', + 'state_tracker/st_texture.h', + 'state_tracker/st_tgsi_lower_yuv.c', +diff --git a/src/mesa/state_tracker/st_bc1_tables.h b/src/mesa/state_tracker/st_bc1_tables.h +new file mode 100644 +index 00000000000..654ccdadb78 +--- /dev/null ++++ b/src/mesa/state_tracker/st_bc1_tables.h +@@ -0,0 +1,124 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */
++
++/* These tables have been generated with:
++
++   #define STB_DXT_IMPLEMENTATION
++   #include "stb_dxt.h"
++   #include <stdio.h>
++
++   int main()
++   {
++      stb__InitDXT();
++
++      printf( "static unsigned char stb__OMatch5[256][2] = {\n" );
++      for( size_t i = 0u; i < 256u; ++i )
++      {
++         printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] );
++         if( i != 255u )
++            printf( ",\n" );
++      }
++      printf( "\n};\n" );
++
++      printf( "static unsigned char stb__OMatch6[256][2] = {\n" );
++      for( size_t i = 0u; i < 256u; ++i )
++      {
++         printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] );
++         if( i != 255u )
++            printf( ",\n" );
++      }
++      printf( "\n};\n" );
++
++      return 0;
++   }
++
++   Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min
++*/
++static unsigned char stb__OMatch5[256][2] = {
++   { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 },
++   { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 },
++   { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 },
++   { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 },
++   { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 },
++   { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 },
++   { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 },
++   { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 },
++   { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 },
++   { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 },
++   { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 },
++   { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 },
++   { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 },
++   { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 },
++   { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 },
++   { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 },
++   { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 },
++   { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 },
++   { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 },
++   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 },
++   { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 },
++   { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 },
++   { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 },
++   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 },
++   { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 },
++   { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 },
++   { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 },
++   { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 },
++   { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 },
++   { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 },
++   { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 },
++   { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 }
++};
++
++static unsigned char stb__OMatch6[256][2] = {
++   { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 },
++   { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 },
++   { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 },
++   { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 },
++   { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 },
++   { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 },
++   { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 },
++   { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 },
++   { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 },
++   { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 },
++   { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 },
++   { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 },
++   { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 },
++   { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 },
++   { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 },
++   { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 },
++   { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 },
++   { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 },
++   { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 },
++   { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 },
++   { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 },
++   { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 },
++   { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 },
++   { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 },
++   { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 },
++   { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 },
++   { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 },
++   { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 },
++   { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 },
++   { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 },
++   { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 },
++   { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 }
++};
+diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
+index f4609e73cf4..bd81d022e6a 100644
+--- a/src/mesa/state_tracker/st_cb_texture.c
++++ b/src/mesa/state_tracker/st_cb_texture.c
+@@ -27,6 +27,7 @@
+ 
+ #include <stdio.h>
+ #include "main/bufferobj.h"
++#include "main/context.h"
+ #include "main/enums.h"
+ #include "main/errors.h"
+ #include "main/fbobject.h"
+@@ -62,10 +63,12 @@
+ #include "state_tracker/st_gen_mipmap.h"
+ #include "state_tracker/st_atom.h"
+ #include "state_tracker/st_sampler_view.h"
++#include "state_tracker/st_texcompress_compute.h"
+ #include "state_tracker/st_util.h"
+ 
+ #include "pipe/p_context.h"
+ #include "pipe/p_defines.h"
++#include "util/log.h"
+ #include "util/u_inlines.h"
+ #include "util/u_upload_mgr.h"
+ #include "pipe/p_shader_tokens.h"
+@@ -479,8 +482,6 @@ st_MapTextureImage(struct gl_context *ctx,
+                    GLubyte **mapOut, GLint *rowStrideOut)
+ {
+    struct st_context *st = st_context(ctx);
+-   GLubyte *map;
+-   struct pipe_transfer *transfer;
+ 
+    /* Check for unexpected flags */
+    assert((mode & ~(GL_MAP_READ_BIT |
+@@ -490,47 +491,59 @@ st_MapTextureImage(struct gl_context *ctx,
+    const enum pipe_map_flags transfer_flags =
+       _mesa_access_flags_to_transfer_flags(mode, false);
+ 
+-   map = st_texture_image_map(st, texImage, transfer_flags, x, y, slice, w, h, 1,
+-                              &transfer);
+-   if (map) {
+-      if (st_compressed_format_fallback(st, texImage->TexFormat)) {
+-         /* Some compressed formats don't have to be supported by drivers,
+-          * and st/mesa transparently handles decompression on upload (Unmap),
+-          * so that drivers don't see the compressed formats.
+-          *
+-          * We store the compressed data (it's needed for glGetCompressedTex-
+-          * Image and image copies in OES_copy_image).
+-          */
+-         unsigned z = transfer->box.z;
+-         struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
+-
+-         unsigned blk_w, blk_h;
+-         _mesa_get_format_block_size(texImage->TexFormat, &blk_w, &blk_h);
+-
+-         unsigned y_blocks = DIV_ROUND_UP(texImage->Height2, blk_h);
+-         unsigned stride = *rowStrideOut = itransfer->temp_stride =
+-            _mesa_format_row_stride(texImage->TexFormat, texImage->Width2);
+-         unsigned block_size = _mesa_get_format_bytes(texImage->TexFormat);
+-
+-         assert(texImage->compressed_data);
+-         *mapOut = itransfer->temp_data =
+-            texImage->compressed_data->ptr +
+-            (z * y_blocks + (y / blk_h)) * stride +
+-            (x / blk_w) * block_size;
+-         itransfer->map = map;
+-      }
+-      else {
+-         /* supported mapping */
+-         *mapOut = map;
+-         *rowStrideOut = transfer->stride;
+-      }
+-   }
+-   else {
+-      *mapOut = NULL;
+-      *rowStrideOut = 0;
++   if (st_compressed_format_fallback(st, texImage->TexFormat)) {
++      /* Some compressed formats don't have to be supported by drivers,
++       * and st/mesa transparently handles decompression on upload (Unmap),
++       * so that drivers don't see the compressed formats.
++       *
++       * We store the compressed data (it's needed for glGetCompressedTexImage
++       * and image copies in OES_copy_image).
++       */
++      unsigned z = slice + texImage->Face +
++                   texImage->TexObject->Attrib.MinLayer;
++
++      /* Enlarge the transfer array if it's not large enough. */
++      st_texture_image_insert_transfer(texImage, z, NULL);
++
++      struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
++
++      assert(itransfer->box.depth == 0);
++      if (transfer_flags & PIPE_MAP_WRITE)
++         u_box_2d_zslice(x, y, z, w, h, &itransfer->box);
++
++      unsigned blk_w, blk_h;
++      _mesa_get_format_block_size(texImage->TexFormat, &blk_w, &blk_h);
++
++      unsigned y_blocks = DIV_ROUND_UP(texImage->Height2, blk_h);
++      unsigned stride = *rowStrideOut = itransfer->temp_stride =
++         _mesa_format_row_stride(texImage->TexFormat, texImage->Width2);
++      unsigned block_size = _mesa_get_format_bytes(texImage->TexFormat);
++
++      assert(texImage->compressed_data);
++      *mapOut = itransfer->temp_data =
++         texImage->compressed_data->ptr +
++         (z * y_blocks + (y / blk_h)) * stride +
++         (x / blk_w) * block_size;
++   } else {
++      struct pipe_transfer *transfer;
++      *mapOut = st_texture_image_map(st, texImage, transfer_flags,
++                                     x, y, slice, w, h, 1, &transfer);
++      *rowStrideOut = *mapOut ? transfer->stride : 0;
+    }
+ }
+ 
++static void
++log_unmap_time_delta(const struct pipe_box *box,
++                     const struct gl_texture_image *texImage,
++                     const char *pathname, int64_t start_us)
++{
++   assert(start_us >= 0);
++   mesa_logi("unmap %dx%d pixels of %s data for %s tex, %s path: "
++             "%"PRIi64" us\n", box->width, box->height,
++             util_format_short_name(texImage->TexFormat),
++             util_format_short_name(texImage->pt->format),
++             pathname, os_time_get() - start_us);
++}
+ 
+ void
+ st_UnmapTextureImage(struct gl_context *ctx,
+@@ -544,11 +557,64 @@ st_UnmapTextureImage(struct gl_context *ctx,
+     * support the compressed format. */
+    unsigned z = slice + texImage->Face;
+    struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
+-   struct pipe_transfer *transfer = itransfer->transfer;
+ 
+-   assert(z == transfer->box.z);
++   if (itransfer->box.depth != 0) {
++      assert(itransfer->box.depth == 1);
++
++      /* Toggle logging for the different unmap paths. */
++      const bool log_unmap_time = false;
++      const int64_t unmap_start_us = log_unmap_time ? os_time_get() : 0;
++      if (_mesa_is_format_astc_2d(texImage->TexFormat) &&
++          util_format_is_compressed(texImage->pt->format)) {
++
++         /* DXT5 is the only supported transcode target from ASTC. */
++         assert(texImage->pt->format == PIPE_FORMAT_DXT5_RGBA ||
++                texImage->pt->format == PIPE_FORMAT_DXT5_SRGBA);
++
++         /* Try a compute-based transcode. */
++         if (itransfer->box.x == 0 &&
++             itransfer->box.y == 0 &&
++             itransfer->box.width == texImage->Width &&
++             itransfer->box.height == texImage->Height &&
++             _mesa_has_compute_shaders(ctx) &&
++             st_compute_transcode_astc_to_dxt5(st,
++                itransfer->temp_data,
++                itransfer->temp_stride,
++                texImage->TexFormat,
++                texImage->pt,
++                st_texture_image_resource_level(texImage),
++                itransfer->box.z)) {
++
++            if (log_unmap_time) {
++               log_unmap_time_delta(&itransfer->box, texImage, "GPU",
++                                    unmap_start_us);
++            }
++
++            /* Mark the unmap as complete. */
++            assert(itransfer->transfer == NULL);
++            memset(itransfer, 0, sizeof(struct st_texture_image_transfer));
++
++            return;
++         }
++      }
++
++      struct pipe_transfer *transfer;
++      GLubyte *map = st_texture_image_map(st, texImage,
++                                          PIPE_MAP_WRITE |
++                                          PIPE_MAP_DISCARD_RANGE,
++                                          itransfer->box.x,
++                                          itransfer->box.y, slice,
++                                          itransfer->box.width,
++                                          itransfer->box.height, 1,
++                                          &transfer);
++
++      if (!map) {
++         _mesa_error(ctx, GL_OUT_OF_MEMORY, "compressed fallback map");
++         return;
++      }
++
++      assert(z == transfer->box.z);
+ 
+-   if (transfer->usage & PIPE_MAP_WRITE) {
+       if (util_format_is_compressed(texImage->pt->format)) {
+          /* Transcode into a different compressed format. */
+          unsigned size =
+@@ -590,7 +656,7 @@ st_UnmapTextureImage(struct gl_context *ctx,
+          pack.Alignment = 4;
+ 
+          _mesa_texstore(ctx, 2, GL_RGBA, texImage->pt->format,
+-                        transfer->stride, &itransfer->map,
++                        transfer->stride, &map,
+                         transfer->box.width,
+                         transfer->box.height, 1, GL_RGBA,
+                         GL_UNSIGNED_BYTE, tmp, &pack);
+@@ -598,7 +664,7 @@ st_UnmapTextureImage(struct gl_context *ctx,
+       } else {
+          /* Decompress into an uncompressed format. */
+          if (texImage->TexFormat == MESA_FORMAT_ETC1_RGB8) {
+-            _mesa_etc1_unpack_rgba8888(itransfer->map, transfer->stride,
++            _mesa_etc1_unpack_rgba8888(map, transfer->stride,
+                                        itransfer->temp_data,
+                                        itransfer->temp_stride,
+                                        transfer->box.width,
+@@ -606,14 +672,14 @@ st_UnmapTextureImage(struct gl_context *ctx,
+          } else if (_mesa_is_format_etc2(texImage->TexFormat)) {
+             bool bgra = texImage->pt->format == PIPE_FORMAT_B8G8R8A8_SRGB;
+ 
+-            _mesa_unpack_etc2_format(itransfer->map, transfer->stride,
++            _mesa_unpack_etc2_format(map, transfer->stride,
+                                      itransfer->temp_data,
+                                      itransfer->temp_stride,
+                                      transfer->box.width, transfer->box.height,
+                                      texImage->TexFormat,
+                                      bgra);
+          } else if (_mesa_is_format_astc_2d(texImage->TexFormat)) {
+-            _mesa_unpack_astc_2d_ldr(itransfer->map, transfer->stride,
++            _mesa_unpack_astc_2d_ldr(map, transfer->stride,
+                                      itransfer->temp_data,
+                                      itransfer->temp_stride,
+                                      transfer->box.width, transfer->box.height,
+@@ -622,14 +688,22 @@ st_UnmapTextureImage(struct gl_context *ctx,
+             unreachable("unexpected format for a compressed format fallback");
+          }
+       }
++
++      st_texture_image_unmap(st, texImage, slice);
++
++      if (log_unmap_time) {
++         log_unmap_time_delta(&itransfer->box, texImage, "CPU",
++                              unmap_start_us);
++      }
++
++      memset(&itransfer->box, 0, sizeof(struct pipe_box));
+    }
+ 
+    itransfer->temp_data = NULL;
+    itransfer->temp_stride = 0;
+-   itransfer->map = NULL;
++   } else {
++      st_texture_image_unmap(st, texImage, slice);
+    }
+-
+-   st_texture_image_unmap(st, texImage, slice);
+ }
+ 
+ 
+@@ -1964,6 +2038,11 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
+       goto fallback;
+    }
+ 
++   /* We need both the compressed and non-compressed textures updated,
++    * which neither the PBO nor the memcpy code-path does. */
++   if (st_compressed_format_fallback(st, texImage->TexFormat)) {
++      goto fallback;
++   }
+ 
+    /* See if the destination format is supported. */
+    if (format == GL_DEPTH_COMPONENT || format == GL_DEPTH_STENCIL)
+diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
+index 0c6b630a2e4..bb08b66491c 100644
+--- a/src/mesa/state_tracker/st_context.c
++++ b/src/mesa/state_tracker/st_context.c
+@@ -56,6 +56,7 @@
+ #include "st_program.h"
+ #include "st_sampler_view.h"
+ #include "st_shader_cache.h"
++#include "st_texcompress_compute.h"
+ #include "st_texture.h"
+ #include "st_util.h"
+ #include "pipe/p_context.h"
+@@ -68,6 +69,8 @@
+ #include "compiler/glsl/glsl_parser_extras.h"
+ #include "nir/nir_to_tgsi.h"
+ 
++#include <cutils/properties.h>
++
+ DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
+ 
+ static uint64_t
+@@ -380,6 +383,10 @@ st_destroy_context_priv(struct st_context *st, bool destroy_pipe)
+    st_destroy_drawpix(st);
+    st_destroy_drawtex(st);
+    st_destroy_pbo_helpers(st);
++
++   if (st->transcode_astc)
++      st_destroy_texcompress_compute(st);
++
+    st_destroy_bound_texture_handles(st);
+    st_destroy_bound_image_handles(st);
+ 
+@@ -472,6 +479,7 @@ st_have_perfquery(struct st_context *ctx)
+       pipe->get_intel_perf_query_data;
+ }
+ 
++static char isEnableHardEncode[PROPERTY_VALUE_MAX];
+ static struct st_context *
+ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+                        const struct st_config_options *options)
+@@ -571,6 +579,8 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+                                 PIPE_TEXTURE_2D, 0, 0,
+                                 PIPE_BIND_SAMPLER_VIEW);
+    st->transcode_astc = options->transcode_astc &&
++                        property_get("sys.vmi.gl.texturecompress", isEnableHardEncode, "0") &&
++                        strcmp(isEnableHardEncode, "1") == 0 &&
+                         screen->is_format_supported(screen, PIPE_FORMAT_DXT5_SRGBA,
+                                                     PIPE_TEXTURE_2D, 0, 0,
+                                                     PIPE_BIND_SAMPLER_VIEW) &&
+@@ -776,6 +786,17 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+       return NULL;
+    }
+ 
++   if (_mesa_has_compute_shaders(ctx) &&
++       st->transcode_astc && !st_init_texcompress_compute(st)) {
++      /* Transcoding ASTC to DXT5 using compute shaders can provide a
++       * significant performance benefit over the CPU path. It isn't strictly
++       * necessary to fail if we can't use the compute shader path, but it's
++       * very convenient to do so. This should be rare.
++       */
++      st_destroy_context_priv(st, false);
++      return NULL;
++   }
++
+    /* This must be done after extensions are initialized to enable persistent
+     * mappings immediately.
+     */
+diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
+index 5f422769615..d83c614a91a 100644
+--- a/src/mesa/state_tracker/st_context.h
++++ b/src/mesa/state_tracker/st_context.h
+@@ -335,6 +335,13 @@ struct st_context
+       bool use_gs;
+    } pbo;
+ 
++   struct {
++      struct gl_program **progs;
++      struct pipe_resource *bc1_endpoint_buf;
++      struct pipe_sampler_view *astc_luts[5];
++      struct hash_table *astc_partition_tables;
++   } texcompress_compute;
++
+    /** for drawing with st_util_vertex */
+    struct cso_velems_state util_velems;
+ 
+diff --git a/src/mesa/state_tracker/st_texcompress_compute.c b/src/mesa/state_tracker/st_texcompress_compute.c
+new file mode 100644
+index 00000000000..118efc55d7c
+--- /dev/null
++++ b/src/mesa/state_tracker/st_texcompress_compute.c
+@@ -0,0 +1,846 @@
++/**************************************************************************
++ *
++ * Copyright © 2022 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ **************************************************************************/
++
++#include "compiler/glsl/astc_glsl.h"
++#include "compiler/glsl/bc1_glsl.h"
++#include "compiler/glsl/bc4_glsl.h"
++#include "compiler/glsl/cross_platform_settings_piece_all.h"
++#include "compiler/glsl/etc2_rgba_stitch_glsl.h"
++
++#include "main/context.h"
++#include "main/shaderapi.h"
++#include "main/shaderobj.h"
++#include "main/texcompress_astc.h"
++#include "util/texcompress_astc_luts_wrap.h"
++#include "main/uniforms.h"
++
++#include "state_tracker/st_atom_constbuf.h"
++#include "state_tracker/st_bc1_tables.h"
++#include "state_tracker/st_context.h"
++#include "state_tracker/st_program.h"
++#include "state_tracker/st_texcompress_compute.h"
++#include "state_tracker/st_texture.h"
++
++#include "util/u_hash_table.h"
++#include "util/u_string.h"
++
++enum compute_program_id {
++   COMPUTE_PROGRAM_BC1,
++   COMPUTE_PROGRAM_BC4,
++   COMPUTE_PROGRAM_STITCH,
++   COMPUTE_PROGRAM_ASTC_4x4,
++   COMPUTE_PROGRAM_ASTC_5x4,
++   COMPUTE_PROGRAM_ASTC_5x5,
++   COMPUTE_PROGRAM_ASTC_6x5,
++   COMPUTE_PROGRAM_ASTC_6x6,
++   COMPUTE_PROGRAM_ASTC_8x5,
++   COMPUTE_PROGRAM_ASTC_8x6,
++   COMPUTE_PROGRAM_ASTC_8x8,
++   COMPUTE_PROGRAM_ASTC_10x5,
++   COMPUTE_PROGRAM_ASTC_10x6,
++   COMPUTE_PROGRAM_ASTC_10x8,
++   COMPUTE_PROGRAM_ASTC_10x10,
++   COMPUTE_PROGRAM_ASTC_12x10,
++   COMPUTE_PROGRAM_ASTC_12x12,
++   COMPUTE_PROGRAM_COUNT
++};
++
++static struct gl_program * PRINTFLIKE(3, 4)
++get_compute_program(struct st_context *st,
++                    enum compute_program_id prog_id,
++                    const char *source_fmt, ...)
++{
++   /* Try to get the program from the cache. */
++   assert(prog_id < COMPUTE_PROGRAM_COUNT);
++   if (st->texcompress_compute.progs[prog_id])
++      return st->texcompress_compute.progs[prog_id];
++
++   /* Cache miss. Create the final source string. */
++   char *source_str;
++   va_list ap;
++   va_start(ap, source_fmt);
++   int num_printed_bytes = vasprintf(&source_str, source_fmt, ap);
++   va_end(ap);
++   if (num_printed_bytes == -1)
++      return NULL;
++
++   /* Compile and link the shader. Then, destroy the shader string. */
++   const char *strings[] = { source_str };
++   GLuint program =
++      _mesa_CreateShaderProgramv_impl(st->ctx, GL_COMPUTE_SHADER, 1, strings);
++   free(source_str);
++
++   struct gl_shader_program *shProg =
++      _mesa_lookup_shader_program(st->ctx, program);
++   if (!shProg)
++      return NULL;
++
++   if (shProg->data->LinkStatus == LINKING_FAILURE) {
++      fprintf(stderr, "Linking failed:\n%s\n", shProg->data->InfoLog);
++      _mesa_reference_shader_program(st->ctx, &shProg, NULL);
++      return NULL;
++   }
++
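++   /* (Added note: the shader program object itself stays owned by the GL
++    * context; only the linked compute gl_program pointer is cached below,
++    * and the context frees it at destruction time -- see the comment in
++    * st_destroy_texcompress_compute().)
++    */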
++   /* Cache the program and return it. */
++   return st->texcompress_compute.progs[prog_id] =
++          shProg->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
++}
++
++static struct pipe_resource *
++create_bc1_endpoint_ssbo(struct pipe_context *pipe)
++{
++   struct pipe_resource *buffer =
++      pipe_buffer_create(pipe->screen, PIPE_BIND_SHADER_BUFFER,
++                         PIPE_USAGE_IMMUTABLE, sizeof(float) *
++                         (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)));
++
++   if (!buffer)
++      return NULL;
++
++   struct pipe_transfer *transfer;
++   float (*buffer_map)[2] = pipe_buffer_map(pipe, buffer,
++                                            PIPE_MAP_WRITE |
++                                            PIPE_MAP_DISCARD_WHOLE_RESOURCE,
++                                            &transfer);
++   if (!buffer_map) {
++      pipe_resource_reference(&buffer, NULL);
++      return NULL;
++   }
++
++   for (int i = 0; i < 256; i++) {
++      for (int j = 0; j < 2; j++) {
++         buffer_map[i][j] = (float) stb__OMatch5[i][j];
++         buffer_map[i + 256][j] = (float) stb__OMatch6[i][j];
++      }
++   }
++
++   pipe_buffer_unmap(pipe, transfer);
++
++   return buffer;
++}
++
++static void
++bind_compute_state(struct st_context *st,
++                   struct gl_program *prog,
++                   struct pipe_sampler_view **sampler_views,
++                   const struct pipe_shader_buffer *shader_buffers,
++                   const struct pipe_image_view *image_views,
++                   bool cs_handle_from_prog,
++                   bool constbuf0_from_prog)
++{
++   assert(prog->info.stage == PIPE_SHADER_COMPUTE);
++
++   /* Set compute states in the same order as defined in st_atom_list.h */
++
++   assert(prog->affected_states & ST_NEW_CS_STATE);
++   assert(st->shader_has_one_variant[PIPE_SHADER_COMPUTE]);
++   cso_set_compute_shader_handle(st->cso_context,
++                                 cs_handle_from_prog ?
++                                 prog->variants->driver_shader : NULL);
++
++   if (prog->affected_states & ST_NEW_CS_SAMPLER_VIEWS) {
++      st->pipe->set_sampler_views(st->pipe, prog->info.stage, 0,
++                                  prog->info.num_textures, 0, false,
++                                  sampler_views);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_SAMPLERS) {
++      /* Programs seem to set this bit more often than needed. For example, if
++       * a program only uses texelFetch, this shouldn't be needed. Section
++       * "11.1.3.2 Texel Fetches", of the GL 4.6 spec says:
++       *
++       *    Texel fetch proceeds similarly to the steps described for texture
++       *    access in section 11.1.3.5, with the exception that none of the
++       *    operations controlled by sampler object state are performed,
++       *
++       * We assume that the program is using texelFetch or doesn't care about
++       * this state for a similar reason.
++       *
++       * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/8014.
++       */
++   }
++
++   if (prog->affected_states & ST_NEW_CS_CONSTANTS) {
++      st_upload_constants(st, constbuf0_from_prog ? prog : NULL,
++                          prog->info.stage);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_UBOS) {
++      unreachable("Uniform buffer objects not handled");
++   }
++
++   if (prog->affected_states & ST_NEW_CS_ATOMICS) {
++      unreachable("Atomic buffer objects not handled");
++   }
++
++   if (prog->affected_states & ST_NEW_CS_SSBOS) {
++      st->pipe->set_shader_buffers(st->pipe, prog->info.stage, 0,
++                                   prog->info.num_ssbos, shader_buffers,
++                                   prog->sh.ShaderStorageBlocksWriteAccess);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_IMAGES) {
++      st->pipe->set_shader_images(st->pipe, prog->info.stage, 0,
++                                  prog->info.num_images, 0, image_views);
++   }
++}
++
++static void
++dispatch_compute_state(struct st_context *st,
++                       struct gl_program *prog,
++                       struct pipe_sampler_view **sampler_views,
++                       const struct pipe_shader_buffer *shader_buffers,
++                       const struct pipe_image_view *image_views,
++                       unsigned num_workgroups_x,
++                       unsigned num_workgroups_y,
++                       unsigned num_workgroups_z)
++{
++   assert(prog->info.stage == PIPE_SHADER_COMPUTE);
++
++   /* Bind the state */
++   bind_compute_state(st, prog, sampler_views, shader_buffers, image_views,
++                      true, true);
++
++   /* Launch the grid */
++   const struct pipe_grid_info info = {
++      .block[0] = prog->info.workgroup_size[0],
++      .block[1] = prog->info.workgroup_size[1],
++      .block[2] = prog->info.workgroup_size[2],
++      .grid[0] = num_workgroups_x,
++      .grid[1] = num_workgroups_y,
++      .grid[2] = num_workgroups_z,
++   };
++
++   st->pipe->launch_grid(st->pipe, &info);
++
++   /* Unbind the state */
++   bind_compute_state(st, prog, NULL, NULL, NULL, false, false);
++
++   /* If the previously used compute program was relying on any state that was
++    * trampled on by these state changes, dirty the relevant flags.
++    */
++   if (st->cp) {
++      st->ctx->NewDriverState |=
++         st->cp->affected_states & prog->affected_states;
++   }
++}
++
++static struct pipe_resource *
++cs_encode_bc1(struct st_context *st,
++              struct pipe_resource *rgba8_tex)
++{
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_BC1, bc1_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   /* ... complete the program setup by defining the number of refinements to
++    * do on the created blocks. The program will attempt to create a more
++    * accurate encoding on each iteration. Doing at least one refinement
++    * provides a significant improvement in quality and is needed to give a
++    * result comparable to the CPU encoder (according to piglit tests).
++    * Additional refinements don't help as much.
++    */
++   const unsigned num_refinements = 1;
++   _mesa_uniform(0, 1, &num_refinements, st->ctx, prog->shader_program,
++                 GLSL_TYPE_UINT, 1);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R8G8B8A8_UNORM,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++   struct pipe_sampler_view *rgba8_view =
++      st->pipe->create_sampler_view(st->pipe, rgba8_tex, &templ);
++   if (!rgba8_view)
++      return NULL;
++
++   const struct pipe_shader_buffer ssbo = {
++      .buffer = st->texcompress_compute.bc1_endpoint_buf,
++      .buffer_size = st->texcompress_compute.bc1_endpoint_buf->width0,
++   };
++
++   struct pipe_resource *bc1_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32_UINT, 0,
++                        DIV_ROUND_UP(rgba8_tex->width0, 4),
++                        DIV_ROUND_UP(rgba8_tex->height0, 4), 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!bc1_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = bc1_tex,
++      .format = PIPE_FORMAT_R16G16B16A16_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, &rgba8_view, &ssbo, &image,
++                          DIV_ROUND_UP(rgba8_tex->width0, 32),
++                          DIV_ROUND_UP(rgba8_tex->height0, 32), 1);
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rgba8_view, NULL);
++
++   return bc1_tex;
++}
++
++static struct pipe_resource *
++cs_encode_bc4(struct st_context *st,
++              struct pipe_resource *rgba8_tex,
++              enum pipe_swizzle component, bool use_snorm)
++{
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_BC4, bc4_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   /* ... complete the program setup by picking the channel to encode and
++    * whether to encode it as snorm. The shader doesn't actually support
++    * channel index 2. So, pick index 0 and rely on swizzling instead.
++    */
++   const unsigned params[] = { 0, use_snorm };
++   _mesa_uniform(0, 1, params, st->ctx, prog->shader_program,
++                 GLSL_TYPE_UINT, 2);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R8G8B8A8_UNORM,
++      .swizzle_r = component,
++      .swizzle_g = PIPE_SWIZZLE_0,
++      .swizzle_b = PIPE_SWIZZLE_0,
++      .swizzle_a = PIPE_SWIZZLE_1,
++   };
++   struct pipe_sampler_view *rgba8_view =
++      st->pipe->create_sampler_view(st->pipe, rgba8_tex, &templ);
++   if (!rgba8_view)
++      return NULL;
++
++   struct pipe_resource *bc4_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32_UINT, 0,
++                        DIV_ROUND_UP(rgba8_tex->width0, 4),
++                        DIV_ROUND_UP(rgba8_tex->height0, 4), 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!bc4_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = bc4_tex,
++      .format = PIPE_FORMAT_R16G16B16A16_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, &rgba8_view, NULL, &image, 1,
++                          DIV_ROUND_UP(rgba8_tex->width0, 16),
++                          DIV_ROUND_UP(rgba8_tex->height0, 16));
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rgba8_view, NULL);
++
++   return bc4_tex;
++}
++
++static struct pipe_resource *
++cs_stitch_64bpb_textures(struct st_context *st,
++                         struct pipe_resource *tex_hi,
++                         struct pipe_resource *tex_lo)
++{
++   assert(util_format_get_blocksizebits(tex_hi->format) == 64);
++   assert(util_format_get_blocksizebits(tex_lo->format) == 64);
++   assert(tex_hi->width0 == tex_lo->width0);
++   assert(tex_hi->height0 == tex_lo->height0);
++
++   struct pipe_resource *stitched_tex = NULL;
++
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_STITCH, etc2_rgba_stitch_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R32G32_UINT,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_0,
++      .swizzle_a = PIPE_SWIZZLE_1,
++   };
++   struct pipe_sampler_view *rg32_views[2] = {
++      [0] = st->pipe->create_sampler_view(st->pipe, tex_hi, &templ),
++      [1] = st->pipe->create_sampler_view(st->pipe, tex_lo, &templ),
++   };
++   if (!rg32_views[0] || !rg32_views[1])
++      goto release_sampler_views;
++
++   stitched_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32B32A32_UINT, 0,
++                        tex_hi->width0,
++                        tex_hi->height0, 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!stitched_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = stitched_tex,
++      .format = PIPE_FORMAT_R32G32B32A32_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, rg32_views, NULL, &image,
++                          DIV_ROUND_UP(tex_hi->width0, 8),
++                          DIV_ROUND_UP(tex_hi->height0, 8), 1);
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rg32_views[0], NULL);
++   pipe_sampler_view_reference(&rg32_views[1], NULL);
++
++   return stitched_tex;
++}
++
++static struct pipe_resource *
++cs_encode_bc3(struct st_context *st,
++              struct pipe_resource *rgba8_tex)
++{
++   struct pipe_resource *bc3_tex = NULL;
++
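++   /* (Added note: a BC3/DXT5 block is 128 bits -- a 64-bit BC4-style alpha
++    * block plus a 64-bit BC1-style color block -- so the encode runs as two
++    * separate compute passes whose outputs are stitched into one 128-bit
++    * block per texel below.)
++    */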
++   /* Encode RGB channels as BC1. */
++   struct pipe_resource *bc1_tex = cs_encode_bc1(st, rgba8_tex);
++   if (!bc1_tex)
++      return NULL;
++
++   /* Encode alpha channels as BC4. */
++   struct pipe_resource *bc4_tex =
++      cs_encode_bc4(st, rgba8_tex, PIPE_SWIZZLE_W, false);
++   if (!bc4_tex)
++      goto release_textures;
++
++   st->pipe->memory_barrier(st->pipe, PIPE_BARRIER_TEXTURE);
++
++   /* Combine BC1 and BC4 to create BC3. */
++   bc3_tex = cs_stitch_64bpb_textures(st, bc1_tex, bc4_tex);
++   if (!bc3_tex)
++      goto release_textures;
++
++release_textures:
++   pipe_resource_reference(&bc1_tex, NULL);
++   pipe_resource_reference(&bc4_tex, NULL);
++
++   return bc3_tex;
++}
++
++static struct pipe_resource *
++sw_decode_astc(struct st_context *st,
++               uint8_t *astc_data,
++               unsigned astc_stride,
++               mesa_format astc_format,
++               unsigned width_px, unsigned height_px)
++{
++   /* Create the destination */
++   struct pipe_resource *rgba8_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8G8B8A8_UNORM, 0,
++                        width_px, height_px, 1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!rgba8_tex)
++      return NULL;
++
++   /* Temporarily map the destination and decode into the returned pointer */
++   struct pipe_transfer *rgba8_xfer;
++   void *rgba8_map = pipe_texture_map(st->pipe, rgba8_tex, 0, 0,
++                                      PIPE_MAP_WRITE, 0, 0,
++                                      width_px, height_px, &rgba8_xfer);
++   if (!rgba8_map) {
++      pipe_resource_reference(&rgba8_tex, NULL);
++      return NULL;
++   }
++
++   _mesa_unpack_astc_2d_ldr(rgba8_map, rgba8_xfer->stride,
++                            astc_data, astc_stride,
++                            width_px, height_px, astc_format);
++
++   pipe_texture_unmap(st->pipe, rgba8_xfer);
++
++   return rgba8_tex;
++}
++
++static struct pipe_sampler_view *
++create_astc_cs_payload_view(struct st_context *st,
++                            uint8_t *data, unsigned stride,
++                            uint32_t width_el, uint32_t height_el)
++{
++   const struct pipe_resource src_templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R32G32B32A32_UINT,
++      .bind = PIPE_BIND_SAMPLER_VIEW,
++      .usage = PIPE_USAGE_STAGING,
++      .width0 = width_el,
++      .height0 = height_el,
++      .depth0 = 1,
++      .array_size = 1,
++   };
++
++   struct pipe_resource *payload_res =
++      st->screen->resource_create(st->screen, &src_templ);
++
++   if (!payload_res)
++      return NULL;
++
++   struct pipe_box box;
++   u_box_origin_2d(width_el, height_el, &box);
++
++   st->pipe->texture_subdata(st->pipe, payload_res, 0, 0,
++                             &box,
++                             data,
++                             stride,
++                             0 /* unused */);
++
++   const struct pipe_sampler_view view_templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = payload_res->format,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++
++   struct pipe_sampler_view *view =
++      st->pipe->create_sampler_view(st->pipe, payload_res, &view_templ);
++
++   pipe_resource_reference(&payload_res, NULL);
++
++   return view;
++}
++
++static struct pipe_sampler_view *
++get_astc_partition_table_view(struct st_context *st,
++                              unsigned block_w,
++                              unsigned block_h)
++{
++   unsigned lut_width;
++   unsigned lut_height;
++   struct pipe_box ptable_box;
++   void *ptable_data =
++      _mesa_get_astc_decoder_partition_table(block_w, block_h, &lut_width, &lut_height);
++   u_box_origin_2d(lut_width, lut_height, &ptable_box);
++
++   struct pipe_sampler_view *view =
++      util_hash_table_get(st->texcompress_compute.astc_partition_tables,
++                          ptable_data);
++
++   if (view)
++      return view;
++
++   struct pipe_resource *res =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8_UINT, 0,
++                        ptable_box.width, ptable_box.height,
++                        1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!res)
++      return NULL;
++
++   st->pipe->texture_subdata(st->pipe, res, 0, 0,
++                             &ptable_box,
++                             ptable_data,
++                             ptable_box.width,
++                             0 /* unused */);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = res->format,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++
++   view = st->pipe->create_sampler_view(st->pipe, res, &templ);
++
++   pipe_resource_reference(&res, NULL);
++
++   if (view) {
++      _mesa_hash_table_insert(st->texcompress_compute.astc_partition_tables,
++                              ptable_data, view);
++      ASSERTED const unsigned max_entries =
++         COMPUTE_PROGRAM_ASTC_12x12 - COMPUTE_PROGRAM_ASTC_4x4 + 1;
++      assert(_mesa_hash_table_num_entries(
++               st->texcompress_compute.astc_partition_tables) < max_entries);
++   }
++
++   return view;
++}
++
++static struct pipe_resource *
++cs_decode_astc(struct st_context *st,
++               uint8_t *astc_data,
++               unsigned astc_stride,
++               mesa_format astc_format,
++               unsigned width_px, unsigned height_px)
++{
++   const enum compute_program_id astc_id = COMPUTE_PROGRAM_ASTC_4x4 +
++      util_format_linear(astc_format) - PIPE_FORMAT_ASTC_4x4;
++
++   unsigned block_w, block_h;
++   _mesa_get_format_block_size(astc_format, &block_w, &block_h);
++
++   struct gl_program *prog =
++      get_compute_program(st, astc_id, astc_source, block_w, block_h);
++
++   if (!prog)
++      return NULL;
++
++   struct pipe_sampler_view *ptable_view =
++      get_astc_partition_table_view(st, block_w, block_h);
++
++   if (!ptable_view)
++      return NULL;
++
++   struct pipe_sampler_view *payload_view =
++      create_astc_cs_payload_view(st, astc_data, astc_stride,
++                                  DIV_ROUND_UP(width_px, block_w),
++                                  DIV_ROUND_UP(height_px, block_h));
++
++   if (!payload_view)
++      return NULL;
++
++   /* Create the destination */
++   struct pipe_resource *rgba8_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8G8B8A8_UNORM, 0,
++                        width_px, height_px, 1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++
++   if (!rgba8_tex)
++      goto release_payload_view;
++
++   const struct pipe_image_view image = {
++      .resource = rgba8_tex,
++      .format = PIPE_FORMAT_R8G8B8A8_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   struct pipe_sampler_view *sampler_views[] = {
++      st->texcompress_compute.astc_luts[0],
++      st->texcompress_compute.astc_luts[1],
++      st->texcompress_compute.astc_luts[2],
++      st->texcompress_compute.astc_luts[3],
++      st->texcompress_compute.astc_luts[4],
++      ptable_view,
++      payload_view,
++   };
++
++   dispatch_compute_state(st, prog, sampler_views, NULL, &image,
++                          DIV_ROUND_UP(payload_view->texture->width0, 2),
++                          DIV_ROUND_UP(payload_view->texture->height0, 2),
++                          1);
++
++release_payload_view:
++   pipe_sampler_view_reference(&payload_view, NULL);
++
++   return rgba8_tex;
++}
++
++static struct pipe_sampler_view *
++get_sampler_view_for_lut(struct pipe_context *pipe,
++                         const astc_decoder_lut *lut)
++{
++   struct pipe_resource *res =
++      pipe_buffer_create_with_data(pipe,
++                                   PIPE_BIND_SAMPLER_VIEW,
++                                   PIPE_USAGE_DEFAULT,
++                                   lut->size_B,
++                                   lut->data);
++   if (!res)
++      return NULL;
++
++   const struct pipe_sampler_view templ = {
++      .format = lut->format,
++      .target = PIPE_BUFFER,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++      .u.buf.offset = 0,
++      .u.buf.size = lut->size_B,
++   };
++
++   struct pipe_sampler_view *view =
++      pipe->create_sampler_view(pipe, res, &templ);
++
++   pipe_resource_reference(&res, NULL);
++
++   return view;
++}
++
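++/* (Added note: the LUT contents come from util/texcompress_astc_luts_wrap.h,
++ * which wraps the Granite ASTC decoder tables; each table is uploaded once
++ * as a texel buffer above and then shared by the decode shaders for every
++ * supported block size.)
++ */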
++/* Initializes required resources for Granite ASTC GPU decode.
++ *
++ * There are 5 texture buffer objects and one additional texture required.
++ * We initialize 5 tbo's here and a single texture later during runtime.
++ */
++static bool
++initialize_astc_decoder(struct st_context *st)
++{
++   astc_decoder_lut_holder astc_lut_holder;
++   _mesa_init_astc_decoder_luts(&astc_lut_holder);
++
++   const astc_decoder_lut *luts[] = {
++      &astc_lut_holder.color_endpoint,
++      &astc_lut_holder.color_endpoint_unquant,
++      &astc_lut_holder.weights,
++      &astc_lut_holder.weights_unquant,
++      &astc_lut_holder.trits_quints,
++   };
++
++   for (unsigned i = 0; i < ARRAY_SIZE(luts); i++) {
++      st->texcompress_compute.astc_luts[i] =
++         get_sampler_view_for_lut(st->pipe, luts[i]);
++      if (!st->texcompress_compute.astc_luts[i])
++         return false;
++   }
++
++   st->texcompress_compute.astc_partition_tables =
++      _mesa_pointer_hash_table_create(NULL);
++
++   if (!st->texcompress_compute.astc_partition_tables)
++      return false;
++
++   return true;
++}
++
++bool
++st_init_texcompress_compute(struct st_context *st)
++{
++   st->texcompress_compute.progs =
++      calloc(COMPUTE_PROGRAM_COUNT, sizeof(struct gl_program *));
++   if (!st->texcompress_compute.progs)
++      return false;
++
++   st->texcompress_compute.bc1_endpoint_buf =
++      create_bc1_endpoint_ssbo(st->pipe);
++   if (!st->texcompress_compute.bc1_endpoint_buf)
++      return false;
++
++   if (!initialize_astc_decoder(st))
++      return false;
++
++   return true;
++}
++
++static void
++destroy_astc_decoder(struct st_context *st)
++{
++   for (unsigned i = 0; i < ARRAY_SIZE(st->texcompress_compute.astc_luts); i++)
++      pipe_sampler_view_reference(&st->texcompress_compute.astc_luts[i], NULL);
++
++   if (st->texcompress_compute.astc_partition_tables) {
++      hash_table_foreach(st->texcompress_compute.astc_partition_tables,
++                         entry) {
++         pipe_sampler_view_reference(
++            (struct pipe_sampler_view **)&entry->data, NULL);
++      }
++   }
++
++   _mesa_hash_table_destroy(st->texcompress_compute.astc_partition_tables,
++                            NULL);
++}
++
++void
++st_destroy_texcompress_compute(struct st_context *st)
++{
++   /* The programs in the array are part of the gl_context (in st->ctx). They
++    * are automatically destroyed when the context is destroyed (via
++    * _mesa_free_context_data -> ... -> free_shader_program_data_cb).
++    */
++   free(st->texcompress_compute.progs);
++
++   /* Destroy the SSBO used by the BC1 shader program. */
++   pipe_resource_reference(&st->texcompress_compute.bc1_endpoint_buf, NULL);
++
++   destroy_astc_decoder(st);
++}
++
++/* See st_texcompress_compute.h for more information. */
++bool
++st_compute_transcode_astc_to_dxt5(struct st_context *st,
++                                  uint8_t *astc_data,
++                                  unsigned astc_stride,
++                                  mesa_format astc_format,
++                                  struct pipe_resource *dxt5_tex,
++                                  unsigned dxt5_level,
++                                  unsigned dxt5_layer)
++{
++   assert(_mesa_has_compute_shaders(st->ctx));
++   assert(_mesa_is_format_astc_2d(astc_format));
++   assert(dxt5_tex->format == PIPE_FORMAT_DXT5_RGBA ||
++          dxt5_tex->format == PIPE_FORMAT_DXT5_SRGBA);
++   assert(dxt5_level <= dxt5_tex->last_level);
++   assert(dxt5_layer <= util_max_layer(dxt5_tex, dxt5_level));
++
++   bool success = false;
++
++   /* Decode ASTC to RGBA8. */
++   struct pipe_resource *rgba8_tex =
++      cs_decode_astc(st, astc_data, astc_stride, astc_format,
++                     u_minify(dxt5_tex->width0, dxt5_level),
++                     u_minify(dxt5_tex->height0, dxt5_level));
++   if (!rgba8_tex)
++      return false;
++
++   st->pipe->memory_barrier(st->pipe, PIPE_BARRIER_TEXTURE);
++
++   /* Encode RGBA8 to BC3. */
++   struct pipe_resource *bc3_tex = cs_encode_bc3(st, rgba8_tex);
++   if (!bc3_tex)
++      goto release_textures;
++
++   /* Upload the result. */
++   struct pipe_box src_box;
++   u_box_origin_2d(bc3_tex->width0, bc3_tex->height0, &src_box);
++   st->pipe->resource_copy_region(st->pipe, dxt5_tex, dxt5_level,
++                                  0, 0, dxt5_layer, bc3_tex, 0, &src_box);
++
++   success = true;
++
++release_textures:
++   pipe_resource_reference(&rgba8_tex, NULL);
++   pipe_resource_reference(&bc3_tex, NULL);
++
++   return success;
++}
+diff --git a/src/mesa/state_tracker/st_texcompress_compute.h b/src/mesa/state_tracker/st_texcompress_compute.h
+new file mode 100644
+index 00000000000..fb2f08fdae7
+--- /dev/null
++++ b/src/mesa/state_tracker/st_texcompress_compute.h
+@@ -0,0 +1,51 @@
++/**************************************************************************
++ *
++ * Copyright © 2022 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ **************************************************************************/
++
++#ifndef ST_TEXCOMPRESS_COMPUTE_H
++#define ST_TEXCOMPRESS_COMPUTE_H
++
++bool
++st_init_texcompress_compute(struct st_context *st);
++
++void
++st_destroy_texcompress_compute(struct st_context *st);
++
++/**
++ * When this function returns true, the destination image will contain the
++ * contents of astc_data but transcoded to DXT5/BC3.
++ *
++ * Note that this function will internally create compute programs by using
++ * glCreateShaderProgramv with the application's GL context.
++ */
++bool
++st_compute_transcode_astc_to_dxt5(struct st_context *st,
++                                  uint8_t *astc_data,
++                                  unsigned astc_stride,
++                                  mesa_format astc_format,
++                                  struct pipe_resource *dxt5_tex,
++                                  unsigned dxt5_level,
++                                  unsigned dxt5_layer);
++
++#endif /* ST_TEXCOMPRESS_COMPUTE_H */
+diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
+index 0e72491e1b7..3e03d2fc3f5 100644
+--- a/src/mesa/state_tracker/st_texture.c
++++ b/src/mesa/state_tracker/st_texture.c
+@@ -241,6 +241,41 @@ st_texture_match_image(struct st_context *st,
+    return GL_TRUE;
+ }
+ 
++void
++st_texture_image_insert_transfer(struct gl_texture_image *stImage,
++                                 unsigned index,
++                                 struct pipe_transfer *transfer)
++{
++   /* Enlarge the transfer array if it's not large enough. */
++   if (index >= stImage->num_transfers) {
++      unsigned new_size = index + 1;
++
++      stImage->transfer = realloc(stImage->transfer,
++                                  new_size * sizeof(struct st_texture_image_transfer));
++      memset(&stImage->transfer[stImage->num_transfers], 0,
++             (new_size - stImage->num_transfers) *
++             sizeof(struct st_texture_image_transfer));
++      stImage->num_transfers = new_size;
++   }
++
++   assert(!stImage->transfer[index].transfer);
++   stImage->transfer[index].transfer = transfer;
++}
++
++/* See st_texture.h for more information. */
++GLuint
++st_texture_image_resource_level(struct gl_texture_image *stImage)
++{
++   /* An image for a non-finalized texture object only has a single level. */
++   if (stImage->pt != stImage->TexObject->pt)
++      return 0;
++
++   /* An immutable texture object may have views with an LOD offset. */
++   if (stImage->TexObject->Immutable)
++      return stImage->Level + stImage->TexObject->Attrib.MinLevel;
++
++   return stImage->Level;
++}
+ 
+ /**
+  * Map a texture image and return the address for a particular 2D face/slice/
+@@ -282,22 +317,10 @@ st_texture_image_map(struct st_context *st, struct gl_texture_image *stImage,
+ 
+    map = pipe_texture_map_3d(st->pipe, stImage->pt, level, usage,
+                              x, y, z, w, h, d, transfer);
+-   if (map) {
+-      /* Enlarge the transfer array if it's not large enough. */
+-      if (z >= stImage->num_transfers) {
+-         unsigned new_size = z + 1;
+-
+-         stImage->transfer = realloc(stImage->transfer,
+-                                     new_size * sizeof(struct st_texture_image_transfer));
+-         memset(&stImage->transfer[stImage->num_transfers], 0,
+-                (new_size - stImage->num_transfers) *
+-                sizeof(struct st_texture_image_transfer));
+-         stImage->num_transfers = new_size;
+-      }
+ 
+-      assert(!stImage->transfer[z].transfer);
+-      stImage->transfer[z].transfer = *transfer;
+-   }
++   if (map)
++      st_texture_image_insert_transfer(stImage, z, *transfer);
++
+    return map;
+ }
+ 
+diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
+index 29e9c98a354..ea989707e49 100644
+--- a/src/mesa/state_tracker/st_texture.h
++++ b/src/mesa/state_tracker/st_texture.h
+@@ -46,7 +46,7 @@ struct st_texture_image_transfer
+    /* For compressed texture fallback. */
+    GLubyte *temp_data; /**< Temporary compressed texture storage. */
+    unsigned temp_stride; /**< Stride of the compressed texture storage. */
+-   GLubyte *map; /**< Saved map pointer of the uncompressed transfer. */
++   struct pipe_box box; /**< Region of the transfer's resource to write. */
+ };
+ 
+ 
+@@ -178,6 +178,21 @@ st_texture_match_image(struct st_context *st,
+                        const struct pipe_resource *pt,
+                        const struct gl_texture_image *image);
+ 
++/* Insert a transfer pointer into the image's transfer array at the specified
++ * index. The array is reallocated if necessary.
++ */
++void
++st_texture_image_insert_transfer(struct gl_texture_image *stImage,
++                                 unsigned index,
++                                 struct pipe_transfer *transfer);
++
++/**
++ * Returns the level of the gl_texture_image with respect to the resource it
++ * is allocated within. Example: returns 0 for non-finalized texture.
++ */
++GLuint
++st_texture_image_resource_level(struct gl_texture_image *stImage);
++
+ /* Return a pointer to an image within a texture. Return image stride as
+  * well.
+  */
--
Gitee
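
Note: even with these patches applied, the compute transcode path stays
disabled until the Android system property checked in st_create_context_priv()
is set. A quick way to exercise it on a device (this assumes a root shell and
that the application creates a fresh GL context afterwards):

    adb shell setprop sys.vmi.gl.texturecompress 1

The property name comes from the st_context.c hunk above; any value other than
"1" (including the default "0") keeps the existing CPU decompression fallback.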