From 37b6859865d3699ef7ad7ae2dfa2009a6cac9828 Mon Sep 17 00:00:00 2001
From: zhuanghb3
Date: Thu, 25 Sep 2025 02:18:18 +0000
Subject: [PATCH 1/3] Enable Vulkan support for ASTC textures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 patchForAndroid/external-mesa-0002.patch | 6992 +++++++++++++++++-----
 1 file changed, 5460 insertions(+), 1532 deletions(-)

diff --git a/patchForAndroid/external-mesa-0002.patch b/patchForAndroid/external-mesa-0002.patch
index bd9c3bf..50563ea 100644
--- a/patchForAndroid/external-mesa-0002.patch
+++ b/patchForAndroid/external-mesa-0002.patch
@@ -1,456 +1,1343 @@
+diff --git a/include/vulkan/vulkan_core.h b/include/vulkan/vulkan_core.h
+index 5c8b8461f1c..ca180aa5bdc 100644
+--- a/include/vulkan/vulkan_core.h
++++ b/include/vulkan/vulkan_core.h
+@@ -1499,6 +1499,8 @@ typedef enum VkFormat {
+ VK_FORMAT_PVRTC1_4BPP_SRGB_BLOCK_IMG = 1000054005,
+ VK_FORMAT_PVRTC2_2BPP_SRGB_BLOCK_IMG = 1000054006,
+ VK_FORMAT_PVRTC2_4BPP_SRGB_BLOCK_IMG = 1000054007,
++ VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR = 1000470000,
++ VK_FORMAT_A8_UNORM_KHR = 1000470001,
+ VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_4x4_SFLOAT_BLOCK,
+ VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_5x4_SFLOAT_BLOCK,
+ VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK_EXT = VK_FORMAT_ASTC_5x5_SFLOAT_BLOCK,
+diff --git a/meson.build b/meson.build
+index e479c3bf873..1b620f52f8d 100644
+--- a/meson.build
++++ b/meson.build
+@@ -666,6 +666,15 @@ if vdpau_drivers_path == ''
+ vdpau_drivers_path = join_paths(get_option('libdir'), 'vdpau')
+ endif
+
++prog_glslang = find_program('glslangValidator', native : true, required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk)
++if prog_glslang.found()
++ if run_command(prog_glslang, [ '--quiet', '--version' ], check : false).returncode() == 0
++ glslang_quiet = ['--quiet']
++ else
++ glslang_quiet = []
++ endif
++endif
++
+ if with_gallium_zink
+ dep_vulkan = dependency('vulkan')
+ endif
+diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
+index f07c465220c..54d49e5f12c 100644
+--- a/src/amd/common/ac_surface.c
++++ b/src/amd/common/ac_surface.c
+@@ -557,6 +557,82 @@ static int surf_config_sanity(const struct ac_surf_config *config, unsigned flag
+ return 0;
+ }
+
++static unsigned bpe_to_format(struct radeon_surf *surf)
++{
++ if (surf->blk_w != 1 || surf->blk_h != 1) {
++ if (surf->blk_w == 4 && surf->blk_h == 4) {
++ switch (surf->bpe) {
++ case 8:
++ return ADDR_FMT_BC1;
++ case 16:
++ /* Since BC3 and ASTC4x4 have the same block dimensions and bpe, BC3 is also
++ * reported for ASTC4x4. The match is fine, since addrlib only needs blk_w,
++ * blk_h and bpe to compute surface properties.
++ * TODO: If compress_type can be passed to this function, then this ugly BC3 and ASTC4x4
++ * matching can be avoided. 
++ */ ++ return ADDR_FMT_BC3; ++ default: ++ unreachable("invalid compressed bpe"); ++ } ++ } else if (surf->blk_w == 5 && surf->blk_h == 4) ++ return ADDR_FMT_ASTC_5x4; ++ else if (surf->blk_w == 5 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_5x5; ++ else if (surf->blk_w == 6 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_6x5; ++ else if (surf->blk_w == 6 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_6x6; ++ else if (surf->blk_w == 8 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_8x5; ++ else if (surf->blk_w == 8 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_8x6; ++ else if (surf->blk_w == 8 && surf->blk_h == 8) ++ return ADDR_FMT_ASTC_8x8; ++ else if (surf->blk_w == 10 && surf->blk_h == 5) ++ return ADDR_FMT_ASTC_10x5; ++ else if (surf->blk_w == 10 && surf->blk_h == 6) ++ return ADDR_FMT_ASTC_10x6; ++ else if (surf->blk_w == 10 && surf->blk_h == 8) ++ return ADDR_FMT_ASTC_10x8; ++ else if (surf->blk_w == 10 && surf->blk_h == 10) ++ return ADDR_FMT_ASTC_10x10; ++ else if (surf->blk_w == 12 && surf->blk_h == 10) ++ return ADDR_FMT_ASTC_12x10; ++ else if (surf->blk_w == 12 && surf->blk_h == 12) ++ return ADDR_FMT_ASTC_12x12; ++ } else { ++ switch (surf->bpe) { ++ case 1: ++ assert(!(surf->flags & RADEON_SURF_ZBUFFER)); ++ return ADDR_FMT_8; ++ case 2: ++ assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); ++ return ADDR_FMT_16; ++ case 4: ++ assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER)); ++ return ADDR_FMT_32; ++ case 8: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32; ++ case 12: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32_32; ++ case 16: ++ assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)); ++ return ADDR_FMT_32_32_32_32; ++ default: ++ unreachable("invalid bpe"); ++ } ++ } ++ return ADDR_FMT_INVALID; ++} ++ ++/* The addrlib pitch alignment is forced to this number for all chips to support interop ++ * between any 2 chips. ++ */ ++#define LINEAR_PITCH_ALIGNMENT 256 ++ + static int gfx6_compute_level(ADDR_HANDLE addrlib, const struct ac_surf_config *config, + struct radeon_surf *surf, bool is_stencil, unsigned level, + bool compressed, ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn, +@@ -1002,23 +1078,10 @@ static int gfx6_compute_surface(ADDR_HANDLE addrlib, const struct radeon_info *i + assert(0); + } - #include "gfx10_format_table.h" -+#include "vk_texcompress_bcn.h" +- /* The format must be set correctly for the allocation of compressed +- * textures to work. In other cases, setting the bpp is sufficient. 
+- */
+- if (compressed) {
+- switch (surf->bpe) {
+- case 8:
+- AddrSurfInfoIn.format = ADDR_FMT_BC1;
+- break;
+- case 16:
+- AddrSurfInfoIn.format = ADDR_FMT_BC3;
+- break;
+- default:
+- assert(0);
+- }
+- } else {
++ AddrSurfInfoIn.format = bpe_to_format(surf);
++ if (!compressed)
+ AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
+- }
++
+
+ AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
+ AddrSurfInfoIn.tileIndex = -1;
+@@ -2046,50 +2109,9 @@ static int gfx9_compute_surface(struct ac_addrlib *addrlib, const struct radeon_
+
+ compressed = surf->blk_w == 4 && surf->blk_h == 4;
+
+- /* The format must be set correctly for the allocation of compressed
+- * textures to work. In other cases, setting the bpp is sufficient. */
+- if (compressed) {
+- switch (surf->bpe) {
+- case 8:
+- AddrSurfInfoIn.format = ADDR_FMT_BC1;
+- break;
+- case 16:
+- AddrSurfInfoIn.format = ADDR_FMT_BC3;
+- break;
+- default:
+- assert(0);
+- }
+- } else {
+- switch (surf->bpe) {
+- case 1:
+- assert(!(surf->flags & RADEON_SURF_ZBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_8;
+- break;
+- case 2:
+- assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_16;
+- break;
+- case 4:
+- assert(surf->flags & RADEON_SURF_ZBUFFER || !(surf->flags & RADEON_SURF_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32;
+- break;
+- case 8:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32;
+- break;
+- case 12:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
+- break;
+- case 16:
+- assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+- AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
+- break;
+- default:
+- assert(0);
+- }
++ AddrSurfInfoIn.format = bpe_to_format(surf);
++ if (!compressed)
+ AddrSurfInfoIn.bpp = surf->bpe * 8;
+- }
+
+ bool is_color_surface = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
+ AddrSurfInfoIn.flags.color = is_color_surface && !(surf->flags & RADEON_SURF_NO_RENDER_TARGET);
+@@ -2812,7 +2834,7 @@ bool ac_surface_override_offset_stride(const struct radeon_info *info, struct ra
+ }
+ }
+ surf->u.gfx9.surf_offset = offset;
+- if (surf->u.gfx9.zs.stencil_offset)
++ if (surf->has_stencil)
+ surf->u.gfx9.zs.stencil_offset += offset;
+ } else {
+ if (pitch) {
+@@ -2919,6 +2941,84 @@ uint64_t ac_surface_get_plane_size(const struct radeon_surf *surf,
+ }
+ }
+
++uint64_t
++ac_surface_addr_from_coord(struct ac_addrlib *addrlib, const struct radeon_info *info,
++ const struct radeon_surf *surf, const struct ac_surf_info *surf_info,
++ unsigned level, unsigned x, unsigned y, unsigned layer, bool is_3d)
++{
++ /* Only implemented for GFX9+ */
++ assert(info->chip_class >= GFX9);
++
++ ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT input = {0};
++ input.size = sizeof(ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_INPUT);
++ input.slice = layer;
++ input.mipId = level;
++ input.unalignedWidth = DIV_ROUND_UP(surf_info->width, surf->blk_w);
++ input.unalignedHeight = DIV_ROUND_UP(surf_info->height, surf->blk_h);
++ input.numSlices = is_3d ? surf_info->depth : surf_info->array_size;
++ input.numMipLevels = surf_info->levels;
++ input.numSamples = surf_info->samples;
++ input.numFrags = surf_info->samples;
++ input.swizzleMode = surf->u.gfx9.swizzle_mode;
++ input.resourceType = surf->u.gfx9.resource_type;
++ input.pipeBankXor = surf->tile_swizzle;
++ input.bpp = surf->bpe * 8;
++ input.x = x;
++ input.y = y;
++
++ ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT output = {0};
++ output.size = sizeof(ADDR2_COMPUTE_SURFACE_ADDRFROMCOORD_OUTPUT);
++ Addr2ComputeSurfaceAddrFromCoord(addrlib->handle, &input, &output);
++ return output.addr;
++}
++
++void
++ac_surface_compute_nbc_view(struct ac_addrlib *addrlib, const struct radeon_info *info,
++ const struct radeon_surf *surf, const struct ac_surf_info *surf_info,
++ unsigned level, unsigned layer, struct ac_surf_nbc_view *out)
++{
++ /* Only implemented for GFX10+ */
++ assert(info->chip_class >= GFX10);
++
++ ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT input = {0};
++ input.size = sizeof(ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_INPUT);
++ input.swizzleMode = surf->u.gfx9.swizzle_mode;
++ input.resourceType = surf->u.gfx9.resource_type;
++ switch (surf->bpe) {
++ case 8:
++ input.format = ADDR_FMT_BC1;
++ break;
++ case 16:
++ input.format = ADDR_FMT_BC3;
++ break;
++ default:
++ assert(0);
++ }
++ input.width = surf_info->width;
++ input.height = surf_info->height;
++ input.numSlices = surf_info->array_size;
++ input.numMipLevels = surf_info->levels;
++ input.pipeBankXor = surf->tile_swizzle;
++ input.slice = layer;
++ input.mipId = level;
++
++ ADDR_E_RETURNCODE res;
++ ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT output = {0};
++ output.size = sizeof(ADDR2_COMPUTE_NONBLOCKCOMPRESSEDVIEW_OUTPUT);
++ res = Addr2ComputeNonBlockCompressedView(addrlib->handle, &input, &output);
++ if (res == ADDR_OK) {
++ out->base_address_offset = output.offset;
++ out->tile_swizzle = output.pipeBankXor;
++ out->width = output.unalignedWidth;
++ out->height = output.unalignedHeight;
++ out->num_levels = output.numMipLevels;
++ out->level = output.mipId;
++ out->valid = true;
++ } else {
++ out->valid = false;
++ }
++}
++
+ void ac_surface_print_info(FILE *out, const struct radeon_info *info,
+ const struct radeon_surf *surf)
+ {
+diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
+index 1f8114bb4b7..baf5aea3d12 100644
+--- a/src/amd/common/ac_surface.h
++++ b/src/amd/common/ac_surface.h
+@@ -415,6 +415,18 @@ struct ac_surf_config {
+ unsigned is_cube : 1;
+ };
+
++
++/* Output parameters for ac_surface_compute_nbc_view */
++struct ac_surf_nbc_view {
++ bool valid;
++ uint32_t width;
++ uint32_t height;
++ uint32_t level;
++ uint32_t num_levels; /* Used for max_mip in the resource descriptor */
++ uint8_t tile_swizzle;
++ uint64_t base_address_offset;
++};
++
+ struct ac_addrlib *ac_addrlib_create(const struct radeon_info *info, uint64_t *max_alignment);
+ void ac_addrlib_destroy(struct ac_addrlib *addrlib);
+ void *ac_addrlib_get_handle(struct ac_addrlib *addrlib);
+@@ -470,6 +482,16 @@ uint64_t ac_surface_get_plane_stride(enum chip_class chip_class,
+ uint64_t ac_surface_get_plane_size(const struct radeon_surf *surf,
+ unsigned plane);
+
++
++uint64_t 
ac_surface_addr_from_coord(struct ac_addrlib *addrlib, const struct radeon_info *info, ++ const struct radeon_surf *surf, ++ const struct ac_surf_info *surf_info, unsigned level, ++ unsigned x, unsigned y, unsigned layer, bool is_3d); ++void ac_surface_compute_nbc_view(struct ac_addrlib *addrlib, const struct radeon_info *info, ++ const struct radeon_surf *surf, ++ const struct ac_surf_info *surf_info, unsigned level, ++ unsigned layer, struct ac_surf_nbc_view *out); ++ + void ac_surface_print_info(FILE *out, const struct radeon_info *info, + const struct radeon_surf *surf); + +diff --git a/src/amd/compiler/aco_lower_phis.cpp b/src/amd/compiler/aco_lower_phis.cpp +index 6b8f611ecc7..e49e9805734 100644 +--- a/src/amd/compiler/aco_lower_phis.cpp ++++ b/src/amd/compiler/aco_lower_phis.cpp +@@ -50,73 +50,81 @@ struct ssa_state { + std::vector outputs; /* the output per block */ + }; + +-Operand +-get_ssa(Program* program, unsigned block_idx, ssa_state* state, bool input) ++Operand get_output(Program* program, unsigned block_idx, ssa_state* state); ++ ++void ++fill_outputs(Program* program, ssa_state* state, unsigned start, unsigned end) ++ + { +- if (!input) { +- if (state->visited[block_idx]) +- return state->outputs[block_idx]; +- +- /* otherwise, output == input */ +- Operand output = get_ssa(program, block_idx, state, true); +- state->visited[block_idx] = true; +- state->outputs[block_idx] = output; +- return output; ++ for (unsigned i = start; i < end; ++i) { ++ if (state->visited[i]) ++ continue; ++ state->outputs[i] = get_output(program, i, state); ++ state->visited[i] = true; + } ++} ++ ++Operand ++get_output(Program* program, unsigned block_idx, ssa_state* state) +{ -+ if (property_get("sys.vmi.vk.texturecompress", isEnableHardEncode, "0") && strcmp(isEnableHardEncode, "2") != 0) { -+ return false; ++ Block& block = program->blocks[block_idx]; + +- /* retrieve the Operand by checking the predecessors */ + if (state->any_pred_defined[block_idx] == pred_defined::undef) + return Operand(program->lane_mask); + +- Block& block = program->blocks[block_idx]; +- size_t pred = block.linear_preds.size(); +- Operand op; +- if (block.loop_nest_depth < state->loop_nest_depth) { ++ if (block.loop_nest_depth < state->loop_nest_depth) + /* loop-carried value for loop exit phis */ +- op = Operand::zero(program->lane_mask.bytes()); +- } else if (block.loop_nest_depth > state->loop_nest_depth || pred == 1 || +- block.kind & block_kind_loop_exit) { +- op = get_ssa(program, block.linear_preds[0], state, false); +- } else { +- assert(pred > 1); +- bool previously_visited = state->visited[block_idx]; +- /* potential recursion: anchor at loop header */ +- if (block.kind & block_kind_loop_header) { +- assert(!previously_visited); +- previously_visited = true; +- state->visited[block_idx] = true; +- state->outputs[block_idx] = Operand(Temp(program->allocateTmp(program->lane_mask))); +- } ++ return Operand::zero(program->lane_mask.bytes()); + +- /* collect predecessor output operands */ +- std::vector ops(pred); +- for (unsigned i = 0; i < pred; i++) +- ops[i] = get_ssa(program, block.linear_preds[i], state, false); ++ size_t pred = block.linear_preds.size(); ++ if (block.loop_nest_depth > state->loop_nest_depth || pred == 1 || ++ block.kind & block_kind_loop_exit) ++ return state->outputs[block.linear_preds[0]]; + +- /* check triviality */ +- if (std::all_of(ops.begin() + 1, ops.end(), [&](Operand same) { return same == ops[0]; })) +- return ops[0]; ++ if (block.kind & block_kind_loop_header) { ++ 
assert(!state->visited[block_idx]); + +- /* Return if this was handled in a recursive call by a loop header phi */ +- if (!previously_visited && state->visited[block_idx]) +- return state->outputs[block_idx]; + +- if (block.kind & block_kind_loop_header) +- op = state->outputs[block_idx]; +- else +- op = Operand(Temp(program->allocateTmp(program->lane_mask))); +- +- /* create phi */ +- aco_ptr phi{ +- create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; +- for (unsigned i = 0; i < pred; i++) +- phi->operands[i] = ops[i]; +- phi->definitions[0] = Definition(op.getTemp()); +- block.instructions.emplace(block.instructions.begin(), std::move(phi)); ++ unsigned start_idx = block_idx + 1; ++ unsigned end_idx = block_idx + 1; ++ while (program->blocks[end_idx].loop_nest_depth >= state->loop_nest_depth) ++ end_idx++; ++ ++ state->outputs[block_idx] = Operand(Temp(program->allocateTmp(program->lane_mask))); ++ fill_outputs(program, state, start_idx, end_idx); + } + +- assert(op.size() == program->lane_mask.size()); +- return op; ++ /* collect predecessor output operands */ ++ std::vector ops(pred); ++ for (unsigned i = 0; i < pred; i++) ++ ops[i] = state->outputs[block.linear_preds[i]]; ++ ++ /* check triviality ++ * For loop exit phis that go through the loop header, we already defined a temp as the loop ++ * header block's output. This temp always needs to be defined, so we can't avoid creating the ++ * phi instruction. */ ++ if (!(block.kind & block_kind_loop_header) && ++ std::all_of(ops.begin() + 1, ops.end(), [&](Operand same) { return same == ops[0]; })) ++ return ops[0]; ++ ++ Operand output; ++ ++ if (block.kind & block_kind_loop_header) ++ output = state->outputs[block_idx]; ++ else ++ output = Operand(Temp(program->allocateTmp(program->lane_mask))); ++ ++ /* create phi */ ++ aco_ptr phi{ ++ create_instruction(aco_opcode::p_linear_phi, Format::PSEUDO, pred, 1)}; ++ for (unsigned i = 0; i < pred; i++) ++ phi->operands[i] = ops[i]; ++ phi->definitions[0] = Definition(output.getTemp()); ++ block.instructions.emplace(block.instructions.begin(), std::move(phi)); ++ ++ assert(output.size() == program->lane_mask.size()); ++ ++ return output; + } + + void +@@ -139,7 +147,7 @@ build_merge_code(Program* program, ssa_state* state, Block* block, Operand cur) + { + unsigned block_idx = block->index; + Definition dst = Definition(state->outputs[block_idx].getTemp()); +- Operand prev = get_ssa(program, block_idx, state, true); ++ Operand prev = get_output(program, block_idx, state); + if (cur.isUndefined()) + cur = Operand::zero(program->lane_mask.bytes()); + +@@ -193,8 +201,10 @@ build_merge_code(Program* program, ssa_state* state, Block* block, Operand cur) + } + + void +-init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr& phi) ++fill_state(Program* program, Block* block, ssa_state* state, aco_ptr& phi) + { ++ Builder bld(program); ++ + std::fill(state->any_pred_defined.begin(), state->any_pred_defined.end(), pred_defined::undef); + for (unsigned i = 0; i < block->logical_preds.size(); i++) { + if (phi->operands[i].isUndefined()) +@@ -209,14 +219,14 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + unsigned start = block->logical_preds[0]; + unsigned end = block->index; + +- /* for loop exit phis, start at the loop header */ ++ /* for loop exit phis, start at the loop pre-header */ + if (block->kind & block_kind_loop_exit) { +- while (program->blocks[start - 1].loop_nest_depth >= state->loop_nest_depth) ++ while 
(program->blocks[start].loop_nest_depth >= state->loop_nest_depth) + start--; + /* If the loop-header has a back-edge, we need to insert a phi. + * This will contain a defined value */ +- if (program->blocks[start].linear_preds.size() > 1) +- state->any_pred_defined[start] = pred_defined::temp; ++ if (program->blocks[start + 1].linear_preds.size() > 1) ++ state->any_pred_defined[start + 1] = pred_defined::temp; + } + /* for loop header phis, end at the loop exit */ + if (block->kind & block_kind_loop_header) { +@@ -231,10 +241,10 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + // TODO: find more occasions where pred_defined::zero is beneficial (e.g. with 2+ temp merges) + if (block->kind & block_kind_loop_exit) { + /* zero the loop-carried variable */ +- if (program->blocks[start].linear_preds.size() > 1) { +- state->any_pred_defined[start] |= pred_defined::zero; ++ if (program->blocks[start + 1].linear_preds.size() > 1) { ++ state->any_pred_defined[start + 1] |= pred_defined::zero; + // TODO: emit this zero explicitly +- state->any_pred_defined[start - 1] = pred_defined::const_0; ++ state->any_pred_defined[start] = pred_defined::const_0; + } + } + +@@ -246,14 +256,24 @@ init_any_pred_defined(Program* program, ssa_state* state, Block* block, aco_ptr< + } + + state->any_pred_defined[block->index] = pred_defined::undef; ++ for (unsigned i = 0; i < phi->operands.size(); i++) { ++ unsigned pred = block->logical_preds[i]; ++ if (state->any_pred_defined[pred] != pred_defined::undef) ++ state->outputs[pred] = Operand(bld.tmp(bld.lm)); ++ else ++ state->outputs[pred] = phi->operands[i]; ++ assert(state->outputs[pred].size() == bld.lm.size()); ++ state->visited[pred] = true; + } + -+ // 暂不支持对图像数组进行压缩 -+ if (pCreateInfo->arrayLayers > 1) { -+ return false; -+ } ++ fill_outputs(program, state, start, end); + -+ if (!radv_is_format_emulated(pdev, pCreateInfo->format)) { -+ return false; + } + + void + lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + aco_ptr& phi) + { +- Builder bld(program); +- + if (!state->checked_preds_for_uniform) { + state->all_preds_uniform = !(block->kind & block_kind_merge) && + block->linear_preds.size() == block->logical_preds.size(); +@@ -276,17 +296,7 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + if (block->kind & block_kind_loop_exit) + state->loop_nest_depth += 1; + std::fill(state->visited.begin(), state->visited.end(), false); +- init_any_pred_defined(program, state, block, phi); +- +- for (unsigned i = 0; i < phi->operands.size(); i++) { +- unsigned pred = block->logical_preds[i]; +- if (state->any_pred_defined[pred] != pred_defined::undef) +- state->outputs[pred] = Operand(bld.tmp(bld.lm)); +- else +- state->outputs[pred] = phi->operands[i]; +- assert(state->outputs[pred].size() == bld.lm.size()); +- state->visited[pred] = true; +- } ++ fill_state(program, block, state, phi); + + for (unsigned i = 0; i < phi->operands.size(); i++) + build_merge_code(program, state, &program->blocks[block->logical_preds[i]], phi->operands[i]); +@@ -303,7 +313,7 @@ lower_divergent_bool_phi(Program* program, ssa_state* state, Block* block, + assert(phi->operands.size() == num_preds); + + for (unsigned i = 0; i < num_preds; i++) +- phi->operands[i] = get_ssa(program, block->linear_preds[i], state, false); ++ phi->operands[i] = state->outputs[block->linear_preds[i]]; + + return; + } +diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build +index 8e5c908dbbf..71c8b16d4fc 100644 
+--- a/src/amd/vulkan/meson.build ++++ b/src/amd/vulkan/meson.build +@@ -69,6 +69,7 @@ libradv_files = files( + 'radv_meta_fast_clear.c', + 'radv_meta_fmask_copy.c', + 'radv_meta_fmask_expand.c', ++ 'radv_meta_astc_decode.c', + 'radv_meta_resolve.c', + 'radv_meta_resolve_cs.c', + 'radv_meta_resolve_fs.c', +diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c +index 875f196f29c..4e85be520af 100644 +--- a/src/amd/vulkan/radv_cmd_buffer.c ++++ b/src/amd/vulkan/radv_cmd_buffer.c +@@ -7531,7 +7531,7 @@ radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_inf + radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH); + } + +-static void ++void + radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info) + { + radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, +diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c +index 449ec3037a6..7dbaebafe5c 100644 +--- a/src/amd/vulkan/radv_device.c ++++ b/src/amd/vulkan/radv_device.c +@@ -708,9 +708,11 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm + + #ifdef ANDROID + device->emulate_etc2 = !radv_device_supports_etc(device); ++ device->emulate_astc = true; + #else + device->emulate_etc2 = !radv_device_supports_etc(device) && + driQueryOptionb(&device->instance->dri_options, "radv_require_etc2"); ++ device->emulate_astc = driQueryOptionb(&device->instance->dri_options, "radv_require_astc"); + #endif + + snprintf(device->name, sizeof(device->name), "AMD RADV %s%s", device->rad_info.name, +@@ -957,6 +959,7 @@ static const driOptionDescription radv_dri_options[] = { + DRI_CONF_RADV_DISABLE_DCC(false) + DRI_CONF_RADV_REPORT_APU_AS_DGPU(false) + DRI_CONF_RADV_REQUIRE_ETC2(false) ++ DRI_CONF_RADV_REQUIRE_ASTC(true) + DRI_CONF_RADV_DISABLE_HTILE_LAYERS(false) + DRI_CONF_RADV_DISABLE_ANISO_SINGLE_LEVEL(false) + DRI_CONF_RADV_DISABLE_SINKING_LOAD_INPUT_FS(false) +@@ -1222,7 +1225,7 @@ radv_GetPhysicalDeviceFeatures(VkPhysicalDevice physicalDevice, VkPhysicalDevice + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = radv_device_supports_etc(pdevice) || pdevice->emulate_etc2, +- .textureCompressionASTC_LDR = false, ++ .textureCompressionASTC_LDR = pdevice->emulate_astc, + .textureCompressionBC = true, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = true, +@@ -5574,6 +5577,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + uint64_t va; + const struct radv_image_plane *plane = &iview->image->planes[iview->plane_id]; + const struct radeon_surf *surf = &plane->surface; ++ uint8_t tile_swizzle = plane->surface.tile_swizzle; + + desc = vk_format_description(iview->vk_format); + +@@ -5583,6 +5587,11 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + cb->cb_color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1); + + va = radv_buffer_get_va(iview->image->bo) + iview->image->offset; ++ ++ if (iview->nbc_view.valid) { ++ va += iview->nbc_view.base_address_offset; ++ tile_swizzle = iview->nbc_view.tile_swizzle; + } + + cb->cb_color_base = va >> 8; + +@@ -5609,14 +5618,14 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + } + + cb->cb_color_base += surf->u.gfx9.surf_offset >> 8; +- cb->cb_color_base |= surf->tile_swizzle; ++ cb->cb_color_base |= tile_swizzle; + } else { + const struct legacy_surf_level *level_info = 
&surf->u.legacy.level[iview->base_mip]; + unsigned pitch_tile_max, slice_tile_max, tile_mode_index; + + cb->cb_color_base += level_info->offset_256B; + if (level_info->mode == RADEON_SURF_MODE_2D) +- cb->cb_color_base |= surf->tile_swizzle; ++ cb->cb_color_base |= tile_swizzle; + + pitch_tile_max = level_info->nblk_x / 8 - 1; + slice_tile_max = (level_info->nblk_x * level_info->nblk_y) / 64 - 1; +@@ -5655,7 +5664,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + device->physical_device->rad_info.chip_class <= GFX8) + va += plane->surface.u.legacy.color.dcc_level[iview->base_mip].dcc_offset; + +- unsigned dcc_tile_swizzle = surf->tile_swizzle; ++ unsigned dcc_tile_swizzle = tile_swizzle; + dcc_tile_swizzle &= ((1 << surf->meta_alignment_log2) - 1) >> 8; + + cb->cb_dcc_base = va >> 8; +@@ -5764,9 +5773,17 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + vk_format_get_plane_width(iview->image->vk_format, iview->plane_id, iview->extent.width); + unsigned height = + vk_format_get_plane_height(iview->image->vk_format, iview->plane_id, iview->extent.height); ++ unsigned max_mip = iview->image->info.levels - 1; + + if (device->physical_device->rad_info.chip_class >= GFX10) { +- cb->cb_color_view |= S_028C6C_MIP_LEVEL_GFX10(iview->base_mip); ++ unsigned base_level = iview->base_mip; + -+ const enum util_format_layout format_layout = vk_format_description(pCreateInfo->format)->layout; -+ if (format_layout != UTIL_FORMAT_LAYOUT_ASTC && -+ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_UNORM && -+ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_SRGB) { -+ return false; -+ } ++ if (iview->nbc_view.valid) { ++ base_level = iview->nbc_view.level; ++ max_mip = iview->nbc_view.num_levels - 1; ++ } + -+ // 过滤掉对稀疏纹理的压缩,对该部分纹理进行压缩时可能造成GPU reset。 -+ if (pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT | VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | -+ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT)) { -+ return false; -+ } ++ cb->cb_color_view |= S_028C6C_MIP_LEVEL_GFX10(base_level); + + cb->cb_color_attrib3 |= S_028EE0_MIP0_DEPTH(mip0_depth) | + S_028EE0_RESOURCE_TYPE(surf->u.gfx9.resource_type) | +@@ -5778,7 +5795,7 @@ radv_initialise_color_surface(struct radv_device *device, struct radv_color_buff + } + + cb->cb_color_attrib2 = S_028C68_MIP0_WIDTH(width - 1) | S_028C68_MIP0_HEIGHT(height - 1) | +- S_028C68_MAX_MIP(iview->image->info.levels - 1); ++ S_028C68_MAX_MIP(max_mip); + } + } + +diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c +index 475bd9a17d9..a3d6fd8b650 100644 +--- a/src/amd/vulkan/radv_formats.c ++++ b/src/amd/vulkan/radv_formats.c +@@ -37,6 +37,7 @@ + #include "util/half_float.h" + #include "vulkan/util/vk_format.h" + #include "vulkan/util/vk_enum_defines.h" ++#include "vk_texcompress_astc.h" + + uint32_t + radv_translate_buffer_dataformat(const struct util_format_description *desc, int first_non_void) +@@ -652,24 +653,56 @@ radv_is_zs_format_supported(VkFormat format) + return radv_translate_dbformat(format) != V_028040_Z_INVALID || format == VK_FORMAT_S8_UINT; + } + + -+ // 检查图像的用途,防止压缩 swapchain 的 images;并只对 VK_IMAGE_TYPE_2D 类型纹理进行压缩 -+ if (!(pCreateInfo->usage & VK_IMAGE_USAGE_SAMPLED_BIT) || pCreateInfo->imageType != VK_IMAGE_TYPE_2D) { -+ return false; -+ } + static bool +-radv_is_filter_minmax_format_supported(VkFormat format) ++radv_is_filter_minmax_format_supported(const struct radv_physical_device *pdev, VkFormat format) + { +- /* 
From the Vulkan spec 1.1.71: +- * +- * "The following formats must support the +- * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT feature with +- * VK_IMAGE_TILING_OPTIMAL, if they support +- * VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT." +- */ +- /* TODO: enable more formats. */ ++ enum chip_class gfx_level = pdev->rad_info.chip_class; ++ + switch (format) { ++ case VK_FORMAT_R4G4_UNORM_PACK8: ++ case VK_FORMAT_R4G4B4A4_UNORM_PACK16: ++ case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT: ++ case VK_FORMAT_B4G4R4A4_UNORM_PACK16: ++ case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT: ++ case VK_FORMAT_R5G6B5_UNORM_PACK16: ++ case VK_FORMAT_B5G6R5_UNORM_PACK16: ++ case VK_FORMAT_R5G5B5A1_UNORM_PACK16: ++ case VK_FORMAT_B5G5R5A1_UNORM_PACK16: ++ case VK_FORMAT_A1R5G5B5_UNORM_PACK16: ++ case VK_FORMAT_A1B5G5R5_UNORM_PACK16_KHR: + case VK_FORMAT_R8_UNORM: ++ case VK_FORMAT_A8_UNORM_KHR: + case VK_FORMAT_R8_SNORM: ++ case VK_FORMAT_R8_SRGB: ++ case VK_FORMAT_R8G8_UNORM: ++ case VK_FORMAT_R8G8_SNORM: ++ case VK_FORMAT_R8G8_SRGB: ++ case VK_FORMAT_R8G8B8A8_UNORM: ++ case VK_FORMAT_R8G8B8A8_SNORM: ++ case VK_FORMAT_R8G8B8A8_SRGB: ++ case VK_FORMAT_B8G8R8A8_UNORM: ++ case VK_FORMAT_B8G8R8A8_SNORM: ++ case VK_FORMAT_B8G8R8A8_SRGB: ++ case VK_FORMAT_A8B8G8R8_UNORM_PACK32: ++ case VK_FORMAT_A8B8G8R8_SNORM_PACK32: ++ case VK_FORMAT_A8B8G8R8_SRGB_PACK32: ++ case VK_FORMAT_A2R10G10B10_UNORM_PACK32: ++ case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + case VK_FORMAT_R16_UNORM: + case VK_FORMAT_R16_SNORM: + case VK_FORMAT_R16_SFLOAT: ++ case VK_FORMAT_R16G16_UNORM: ++ case VK_FORMAT_R16G16_SNORM: ++ case VK_FORMAT_R16G16_SFLOAT: ++ case VK_FORMAT_R16G16B16A16_UNORM: ++ case VK_FORMAT_R16G16B16A16_SNORM: ++ case VK_FORMAT_R16G16B16A16_SFLOAT: + case VK_FORMAT_R32_SFLOAT: ++ case VK_FORMAT_R32G32_SFLOAT: ++ case VK_FORMAT_R32G32B32_SFLOAT: ++ case VK_FORMAT_R32G32B32A32_SFLOAT: ++ case VK_FORMAT_B10G11R11_UFLOAT_PACK32: + case VK_FORMAT_D16_UNORM: + case VK_FORMAT_X8_D24_UNORM_PACK32: + case VK_FORMAT_D32_SFLOAT: +@@ -677,11 +710,53 @@ radv_is_filter_minmax_format_supported(VkFormat format) + case VK_FORMAT_D24_UNORM_S8_UINT: + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return true; ++ case VK_FORMAT_R8_UINT: ++ case VK_FORMAT_R8_SINT: ++ case VK_FORMAT_R8G8_UINT: ++ case VK_FORMAT_R8G8_SINT: ++ case VK_FORMAT_R8G8B8A8_UINT: ++ case VK_FORMAT_R8G8B8A8_SINT: ++ case VK_FORMAT_B8G8R8A8_UINT: ++ case VK_FORMAT_B8G8R8A8_SINT: ++ case VK_FORMAT_A8B8G8R8_UINT_PACK32: ++ case VK_FORMAT_A8B8G8R8_SINT_PACK32: ++ case VK_FORMAT_A2R10G10B10_UINT_PACK32: ++ case VK_FORMAT_A2B10G10R10_UINT_PACK32: ++ case VK_FORMAT_R16_UINT: ++ case VK_FORMAT_R16_SINT: ++ case VK_FORMAT_R16G16_UINT: ++ case VK_FORMAT_R16G16_SINT: ++ case VK_FORMAT_R16G16B16A16_UINT: ++ case VK_FORMAT_R16G16B16A16_SINT: ++ case VK_FORMAT_R32_UINT: ++ case VK_FORMAT_R32_SINT: ++ case VK_FORMAT_R32G32_UINT: ++ case VK_FORMAT_R32G32_SINT: ++ case VK_FORMAT_R32G32B32_UINT: ++ case VK_FORMAT_R32G32B32_SINT: ++ case VK_FORMAT_R32G32B32A32_UINT: ++ case VK_FORMAT_R32G32B32A32_SINT: ++ case VK_FORMAT_S8_UINT: ++ return gfx_level >= GFX9; ++ case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32: ++ return gfx_level >= GFX7; + default: + return false; + } + } + ++bool ++radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format) ++{ ++ if (physical_device->emulate_etc2 && vk_texcompress_etc2_emulation_format(format) != VK_FORMAT_UNDEFINED) ++ return true; + -+ // 图像太大内存申请不了 -+ if (pCreateInfo->extent.width >= 3960 && pCreateInfo->extent.height >= 3960) { -+ return false; -+ } ++ 
if (physical_device->emulate_astc && vk_texcompress_astc_emulation_format(format) != VK_FORMAT_UNDEFINED) ++ return true; + -+ // 检查图像大小 -+ if (pCreateInfo->extent.width >= 256 && pCreateInfo->extent.height >= 256) { -+ return true; -+ } -+ return false; ++ return false; +} + + bool + radv_device_supports_etc(struct radv_physical_device *physical_device) + { +@@ -708,8 +783,21 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + return; + } + +- if (desc->layout == UTIL_FORMAT_LAYOUT_ETC && !radv_device_supports_etc(physical_device) && +- !physical_device->emulate_etc2) { ++ if ((desc->layout == UTIL_FORMAT_LAYOUT_ETC && !radv_device_supports_etc(physical_device)) ++ || desc->layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ if (radv_is_format_emulated(physical_device, format)) { ++ /* required features for compressed formats */ ++ tiled = VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_LINEAR_BIT | ++ VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT | ++ VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; ++ ++ VkFormat emulation_format = vk_texcompress_etc2_emulation_format(format); ++ if (emulation_format == VK_FORMAT_UNDEFINED) ++ emulation_format = vk_texcompress_astc_emulation_format(format); ++ ++ if (radv_is_filter_minmax_format_supported(physical_device, emulation_format)) ++ tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; ++ } + out_properties->linearTilingFeatures = linear; + out_properties->optimalTilingFeatures = tiled; + out_properties->bufferFeatures = buffer; +@@ -763,7 +851,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + tiled |= VK_FORMAT_FEATURE_2_BLIT_SRC_BIT | VK_FORMAT_FEATURE_2_BLIT_DST_BIT; + tiled |= VK_FORMAT_FEATURE_2_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_2_TRANSFER_DST_BIT; + +- if (radv_is_filter_minmax_format_supported(format)) ++ if (radv_is_filter_minmax_format_supported(physical_device, format)) + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (vk_format_has_depth(format)) { +@@ -784,7 +872,7 @@ radv_physical_device_get_format_properties(struct radv_physical_device *physical + linear |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_2_BLIT_SRC_BIT; + +- if (radv_is_filter_minmax_format_supported(format)) ++ if (radv_is_filter_minmax_format_supported(physical_device, format)) + tiled |= VK_FORMAT_FEATURE_2_SAMPLED_IMAGE_FILTER_MINMAX_BIT; + + if (linear_sampling) { +@@ -1357,6 +1445,9 @@ radv_check_modifier_support(struct radv_physical_device *dev, + if (info->type != VK_IMAGE_TYPE_2D) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + ++ if (radv_is_format_emulated(dev, format)) ++ return VK_ERROR_FORMAT_NOT_SUPPORTED; ++ + if (!desc || (desc->layout == UTIL_FORMAT_LAYOUT_ETC && dev->emulate_etc2)) + return VK_ERROR_FORMAT_NOT_SUPPORTED; + +@@ -1493,6 +1584,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device, + if (format_feature_flags == 0) + goto unsupported; + ++ if (info->type == VK_IMAGE_TYPE_1D && radv_is_format_emulated(physical_device, format)) ++ goto unsupported; + if (info->type != VK_IMAGE_TYPE_2D && vk_format_is_depth_or_stencil(format)) + goto unsupported; + +@@ -1633,9 +1726,8 @@ radv_get_image_format_properties(struct radv_physical_device *physical_device, + goto unsupported; + } + +- if ((info->flags & +- (VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | 
VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT)) && +- (desc->layout == UTIL_FORMAT_LAYOUT_ETC && physical_device->emulate_etc2)) { ++ if ((info->flags & (VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT | VK_IMAGE_CREATE_BLOCK_TEXEL_VIEW_COMPATIBLE_BIT)) && ++ radv_is_format_emulated(physical_device, format)) { + goto unsupported; + } + +@@ -1681,9 +1773,8 @@ get_external_image_format_properties(struct radv_physical_device *physical_devic + VkExternalMemoryFeatureFlagBits flags = 0; + VkExternalMemoryHandleTypeFlags export_flags = 0; + VkExternalMemoryHandleTypeFlags compat_flags = 0; +- const struct util_format_description *desc = vk_format_description(pImageFormatInfo->format); + +- if (!desc || (desc->layout == UTIL_FORMAT_LAYOUT_ETC && physical_device->emulate_etc2)) ++ if (radv_is_format_emulated(physical_device, pImageFormatInfo->format)) + return; + + if (pImageFormatInfo->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) +diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c +index ae19fc1987c..3fd4ea777a8 100644 +--- a/src/amd/vulkan/radv_image.c ++++ b/src/amd/vulkan/radv_image.c +@@ -544,40 +544,19 @@ radv_patch_image_from_extra_info(struct radv_device *device, struct radv_image * + return VK_SUCCESS; + } + +-static VkFormat +-etc2_emulation_format(VkFormat format) +-{ +- switch (format) { +- case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: +- return VK_FORMAT_R8G8B8A8_UNORM; +- case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: +- case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: +- return VK_FORMAT_R8G8B8A8_SRGB; +- case VK_FORMAT_EAC_R11_UNORM_BLOCK: +- return VK_FORMAT_R16_UNORM; +- case VK_FORMAT_EAC_R11_SNORM_BLOCK: +- return VK_FORMAT_R16_SNORM; +- case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: +- return VK_FORMAT_R16G16_UNORM; +- case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: +- return VK_FORMAT_R16G16_SNORM; +- default: +- unreachable("Unhandled ETC format"); +- } +-} ++extern bool radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + static VkFormat radv_image_get_plane_format(const struct radv_physical_device *pdev, const struct radv_image *image, unsigned plane) -@@ -553,10 +614,15 @@ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struc - if (radv_is_format_emulated(pdev, image->vk_format)) { + { +- if (pdev->emulate_etc2 && +- vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(pdev, image->vk_format)) { if (plane == 0) return image->vk_format; -- if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) -+ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ if (image->isNeedHardEncode) -+ return radv_translate_rgba2dxt5(vk_texcompress_astc_emulation_format(image->vk_format)); - return vk_texcompress_astc_emulation_format(image->vk_format); -- else -+ } else { -+ if (image->isNeedHardEncode) -+ return radv_translate_rgba2dxt5(vk_texcompress_etc2_emulation_format(image->vk_format)); - return vk_texcompress_etc2_emulation_format(image->vk_format); -+ } +- return etc2_emulation_format(image->vk_format); ++ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) ++ return vk_texcompress_astc_emulation_format(image->vk_format); ++ else ++ return vk_texcompress_etc2_emulation_format(image->vk_format); } return vk_format_get_plane_format(image->vk_format, plane); } -@@ -1710,6 
+1776,12 @@ radv_destroy_image(struct radv_device *device, const VkAllocationCallbacks *pAll - vk_free2(&device->vk.alloc, pAllocator, image); +@@ -773,11 +752,13 @@ si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *im + const struct legacy_surf_level *base_level_info, unsigned plane_id, + unsigned base_level, unsigned first_level, unsigned block_width, + bool is_stencil, bool is_storage_image, bool disable_compression, +- bool enable_write_compression, uint32_t *state) ++ bool enable_write_compression, uint32_t *state, ++ const struct ac_surf_nbc_view *nbc_view) + { + struct radv_image_plane *plane = &image->planes[plane_id]; + uint64_t gpu_address = image->bo ? radv_buffer_get_va(image->bo) + image->offset : 0; + uint64_t va = gpu_address; ++ uint8_t swizzle = plane->surface.tile_swizzle; + enum chip_class chip_class = device->physical_device->rad_info.chip_class; + uint64_t meta_va = 0; + if (chip_class >= GFX9) { +@@ -785,12 +766,16 @@ si_set_mutable_tex_desc_fields(struct radv_device *device, struct radv_image *im + va += plane->surface.u.gfx9.zs.stencil_offset; + else + va += plane->surface.u.gfx9.surf_offset; ++ if (nbc_view && nbc_view->valid) { ++ va += nbc_view->base_address_offset; ++ swizzle = nbc_view->tile_swizzle; ++ } + } else + va += (uint64_t)base_level_info->offset_256B * 256; + + state[0] = va >> 8; + if (chip_class >= GFX9 || base_level_info->mode == RADEON_SURF_MODE_2D) +- state[0] |= plane->surface.tile_swizzle; ++ state[0] |= swizzle; + state[1] &= C_008F14_BASE_ADDRESS_HI; + state[1] |= S_008F14_BASE_ADDRESS_HI(va >> 40); + +@@ -960,7 +945,8 @@ gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *ima + const VkComponentMapping *mapping, unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, +- uint32_t *state, uint32_t *fmask_state) ++ uint32_t *state, uint32_t *fmask_state, ++ const struct ac_surf_nbc_view *nbc_view) + { + const struct util_format_description *desc; + enum pipe_swizzle swizzle[4]; +@@ -1015,12 +1001,17 @@ gfx10_make_texture_descriptor(struct radv_device *device, struct radv_image *ima + state[4] = S_00A010_DEPTH(type == V_008F1C_SQ_RSRC_IMG_3D ? depth - 1 : last_layer) | + S_00A010_BASE_ARRAY(first_layer); + state[5] = S_00A014_ARRAY_PITCH(0) | +- S_00A014_MAX_MIP(image->info.samples > 1 ? util_logbase2(image->info.samples) +- : image->info.levels - 1) | + S_00A014_PERF_MOD(4); + state[6] = 0; + state[7] = 0; + ++ unsigned max_mip = ++ image->info.samples > 1 ? 
util_logbase2(image->info.samples) : image->info.levels - 1; ++ if (nbc_view && nbc_view->valid) ++ max_mip = nbc_view->num_levels - 1; ++ ++ state[5] |= S_00A014_MAX_MIP(max_mip); ++ + if (radv_dcc_enabled(image, first_level)) { + state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) | + S_00A018_MAX_COMPRESSED_BLOCK_SIZE( +@@ -1299,12 +1290,12 @@ radv_make_texture_descriptor(struct radv_device *device, struct radv_image *imag + const VkComponentMapping *mapping, unsigned first_level, + unsigned last_level, unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, float min_lod, uint32_t *state, +- uint32_t *fmask_state) ++ uint32_t *fmask_state, const struct ac_surf_nbc_view *nbc_view) + { + if (device->physical_device->rad_info.chip_class >= GFX10) { + gfx10_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, + first_level, last_level, first_layer, last_layer, width, height, +- depth, min_lod, state, fmask_state); ++ depth, min_lod, state, fmask_state, nbc_view); + } else { + si_make_texture_descriptor(device, image, is_storage_image, view_type, vk_format, mapping, + first_level, last_level, first_layer, last_layer, width, height, +@@ -1324,11 +1315,11 @@ radv_query_opaque_metadata(struct radv_device *device, struct radv_image *image, + radv_make_texture_descriptor(device, image, false, (VkImageViewType)image->type, + image->vk_format, &fixedmapping, 0, image->info.levels - 1, 0, + image->info.array_size - 1, image->info.width, image->info.height, +- image->info.depth, 0.0f, desc, NULL); ++ image->info.depth, 0.0f, desc, NULL, NULL); + + si_set_mutable_tex_desc_fields(device, image, &image->planes[0].surface.u.legacy.level[0], 0, 0, + 0, image->planes[0].surface.blk_w, false, false, false, false, +- desc); ++ desc, NULL); + + ac_surface_get_umd_metadata(&device->physical_device->rad_info, &image->planes[0].surface, + image->info.levels, desc, &md->size_metadata, md->metadata); +@@ -1562,7 +1553,7 @@ radv_image_use_comp_to_single(const struct radv_device *device, const struct rad + static unsigned + radv_get_internal_plane_count(const struct radv_physical_device *pdev, VkFormat fmt) + { +- if (pdev->emulate_etc2 && vk_format_description(fmt)->layout == UTIL_FORMAT_LAYOUT_ETC) ++ if (radv_is_format_emulated(pdev, fmt)) + return 2; + return vk_format_get_plane_count(fmt); + } +@@ -2024,17 +2015,34 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ + return VK_SUCCESS; } -+void -+radv_internal_image_finish(struct radv_image *image) ++ ++static inline void ++compute_non_block_compressed_view(const struct radv_device *device, ++ const struct radv_image_view *iview, ++ struct ac_surf_nbc_view *nbc_view) +{ -+ vk_object_base_finish(&image->base); ++ const struct radv_image *image = iview->image; ++ const struct radeon_surf *surf = &image->planes[0].surface; ++ struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); ++ ++ ac_surface_compute_nbc_view(addrlib, &device->physical_device->rad_info, surf, &image->info, ++ iview->base_mip, iview->base_layer, nbc_view); +} ++ + static void - radv_image_print_info(struct radv_device *device, struct radv_image *image) + radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_device *device, + VkFormat vk_format, const VkComponentMapping *components, + float min_lod, + bool is_storage_image, bool disable_compression, + bool enable_compression, unsigned plane_id, +- unsigned descriptor_plane_id) ++ 
unsigned descriptor_plane_id, ++ const struct ac_surf_nbc_view *nbc_view) { -@@ -1890,9 +1962,11 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ - vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO); - assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO); - -+ bool needHardEncode = radv_need_hard_encode(device->physical_device, pCreateInfo); -+ - unsigned plane_count = radv_get_internal_plane_count(device->physical_device, format); - -- bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo); -+ bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo) && !needHardEncode; - bool needSoftDecode = needSoftEncode && radv_need_soft_decode(format); - VkFormat softDecodeFormat = radv_translate_etc(format); - VkFormat softEncodeFormat = radv_translate_rgb2BC(format); -@@ -1936,6 +2010,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_ - image->usage = pCreateInfo->usage; - image->flags = pCreateInfo->flags; - image->plane_count = vk_format_get_plane_count(format); -+ image->isNeedHardEncode = needHardEncode; - - //满足要求的 ETC2 纹理使用软解 - if (needSoftDecode) { + struct radv_image *image = iview->image; + struct radv_image_plane *plane = &image->planes[plane_id]; + bool is_stencil = iview->aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT; ++ unsigned first_layer = iview->base_layer; + uint32_t blk_w; + union radv_descriptor *descriptor; + uint32_t hw_level = 0; +@@ -2050,16 +2058,27 @@ radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_devic + blk_w = plane->surface.blk_w / vk_format_get_blockwidth(plane->format) * + vk_format_get_blockwidth(vk_format); + +- if (device->physical_device->rad_info.chip_class >= GFX9) ++ if (device->physical_device->rad_info.chip_class >= GFX9) { + hw_level = iview->base_mip; ++ if (nbc_view->valid) { ++ hw_level = nbc_view->level; ++ iview->extent.width = nbc_view->width; ++ iview->extent.height = nbc_view->height; ++ ++ /* Clear the base array layer because addrlib adds it as part of the base addr offset. */ ++ first_layer = 0; ++ } ++ } ++ + radv_make_texture_descriptor( + device, image, is_storage_image, iview->type, vk_format, components, hw_level, +- hw_level + iview->level_count - 1, iview->base_layer, ++ hw_level + iview->level_count - 1, first_layer, + iview->base_layer + iview->layer_count - 1, + vk_format_get_plane_width(image->vk_format, plane_id, iview->extent.width), + vk_format_get_plane_height(image->vk_format, plane_id, iview->extent.height), + iview->extent.depth, min_lod, descriptor->plane_descriptors[descriptor_plane_id], +- descriptor_plane_id || is_storage_image ? NULL : descriptor->fmask_descriptor); ++ descriptor_plane_id || is_storage_image ? 
NULL : descriptor->fmask_descriptor, ++ nbc_view); + + const struct legacy_surf_level *base_level_info = NULL; + if (device->physical_device->rad_info.chip_class <= GFX9) { +@@ -2075,7 +2094,7 @@ radv_image_view_make_descriptor(struct radv_image_view *iview, struct radv_devic + si_set_mutable_tex_desc_fields(device, image, base_level_info, plane_id, iview->base_mip, + iview->base_mip, blk_w, is_stencil, is_storage_image, + disable_compression, enable_write_compression, +- descriptor->plane_descriptors[descriptor_plane_id]); ++ descriptor->plane_descriptors[descriptor_plane_id], nbc_view); + } + + static unsigned +@@ -2182,6 +2201,7 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + iview->image = image; + iview->type = pCreateInfo->viewType; + iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask); ++ iview->nbc_view.valid = false; + iview->aspect_mask = pCreateInfo->subresourceRange.aspectMask; + iview->base_layer = range->baseArrayLayer; + iview->layer_count = radv_get_layerCount(image, range); +@@ -2215,15 +2235,11 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + iview->vk_format = radv_translate_etc(iview->vk_format); + } + +- if (device->physical_device->emulate_etc2 && +- vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { +- const struct util_format_description *desc = vk_format_description(iview->vk_format); +- assert(desc); +- if (desc->layout == UTIL_FORMAT_LAYOUT_ETC) { +- iview->plane_id = 1; +- iview->vk_format = etc2_emulation_format(iview->vk_format); +- } +- ++ /* when the view format is emulated, redirect the view to the hidden plane 1 */ ++ if (radv_is_format_emulated(device->physical_device, iview->vk_format)) { ++ assert(radv_is_format_emulated(device->physical_device, image->vk_format)); ++ iview->plane_id = 1; ++ iview->vk_format = image->planes[iview->plane_id].format; + plane_count = 1; + } + +@@ -2242,13 +2258,14 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + } + + if (iview->vk_format != image->planes[iview->plane_id].format) { ++ const struct radv_image_plane *plane = &image->planes[iview->plane_id]; + unsigned view_bw = vk_format_get_blockwidth(iview->vk_format); + unsigned view_bh = vk_format_get_blockheight(iview->vk_format); +- unsigned img_bw = vk_format_get_blockwidth(image->planes[iview->plane_id].format); +- unsigned img_bh = vk_format_get_blockheight(image->planes[iview->plane_id].format); ++ unsigned plane_bw = vk_format_get_blockwidth(plane->format); ++ unsigned plane_bh = vk_format_get_blockheight(plane->format); + +- iview->extent.width = round_up_u32(iview->extent.width * view_bw, img_bw); +- iview->extent.height = round_up_u32(iview->extent.height * view_bh, img_bh); ++ iview->extent.width = round_up_u32(iview->extent.width * view_bw, plane_bw); ++ iview->extent.height = round_up_u32(iview->extent.height * view_bh, plane_bh); + + /* Comment ported from amdvlk - + * If we have the following image: +@@ -2277,27 +2294,37 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + * the plain converted dimensions the physical layout is correct. 
+ */
+ if (device->physical_device->rad_info.chip_class >= GFX9 &&
+- vk_format_is_compressed(image->vk_format) && !vk_format_is_compressed(iview->vk_format)) {
++ vk_format_is_block_compressed(image->vk_format) &&
++ !vk_format_is_block_compressed(iview->vk_format)) {
+ /* If we have multiple levels in the view we should ideally take the last level,
+ * but the mip calculation has a max(..., 1) so walking back to the base mip in an
+ * useful way is hard. */
+ if (iview->level_count > 1) {
+- iview->extent.width = iview->image->planes[0].surface.u.gfx9.base_mip_width;
+- iview->extent.height = iview->image->planes[0].surface.u.gfx9.base_mip_height;
++ iview->extent.width = plane->surface.u.gfx9.base_mip_width;
++ iview->extent.height = plane->surface.u.gfx9.base_mip_height;
+ } else {
+ unsigned lvl_width = radv_minify(image->info.width, range->baseMipLevel);
+ unsigned lvl_height = radv_minify(image->info.height, range->baseMipLevel);
+
+- lvl_width = round_up_u32(lvl_width * view_bw, img_bw);
+- lvl_height = round_up_u32(lvl_height * view_bh, img_bh);
+-
+- lvl_width <<= range->baseMipLevel;
+- lvl_height <<= range->baseMipLevel;
+-
+- iview->extent.width = CLAMP(lvl_width, iview->extent.width,
+- iview->image->planes[0].surface.u.gfx9.base_mip_width);
+- iview->extent.height = CLAMP(lvl_height, iview->extent.height,
+- iview->image->planes[0].surface.u.gfx9.base_mip_height);
++ lvl_width = round_up_u32(lvl_width * view_bw, plane_bw);
++ lvl_height = round_up_u32(lvl_height * view_bh, plane_bh);
++
++ iview->extent.width = CLAMP(lvl_width << range->baseMipLevel, iview->extent.width,
++ plane->surface.u.gfx9.base_mip_width);
++ iview->extent.height = CLAMP(lvl_height << range->baseMipLevel, iview->extent.height,
++ plane->surface.u.gfx9.base_mip_height);
++
++ /* If the hardware-computed extent is still too small, on GFX10
++ * we can attempt another workaround provided by addrlib that
++ * changes the descriptor's base level, and adjusts the address and
++ * extents accordingly. 
++ */ ++ if (device->physical_device->rad_info.chip_class >= GFX10 && ++ (radv_minify(iview->extent.width, range->baseMipLevel) < lvl_width || ++ radv_minify(iview->extent.height, range->baseMipLevel) < lvl_height) && ++ iview->layer_count == 1) { ++ compute_non_block_compressed_view(device, iview, &iview->nbc_view); ++ } + } + } + } +@@ -2311,10 +2338,10 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device, + VkFormat format = vk_format_get_plane_format(iview->vk_format, i); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, false, + disable_compression, enable_compression, iview->plane_id + i, +- i); ++ i, &iview->nbc_view); + radv_image_view_make_descriptor(iview, device, format, &pCreateInfo->components, min_lod, true, + disable_compression, enable_compression, iview->plane_id + i, +- i); ++ i, &iview->nbc_view); + } + } + diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c -index b1080ef8935..9b5a73cd1be 100644 +index 82eb45748dc..b1080ef8935 100644 --- a/src/amd/vulkan/radv_meta.c +++ b/src/amd/vulkan/radv_meta.c -@@ -601,8 +601,14 @@ radv_device_init_meta(struct radv_device *device) +@@ -597,8 +597,14 @@ radv_device_init_meta(struct radv_device *device) if (result != VK_SUCCESS) - goto fail_astc_decode; + goto fail_etc_decode; -+ result = radv_device_init_meta_rgba_encode_state(device); ++ result = radv_device_init_meta_astc_decode_state(device, on_demand); + if (result != VK_SUCCESS) -+ goto fail_bcn_encode; ++ goto fail_astc_decode; + return VK_SUCCESS; -+fail_bcn_encode: -+ radv_device_finish_meta_rgba_encode_state(device); - fail_astc_decode: - radv_device_finish_meta_astc_decode_state(device); ++fail_astc_decode: ++ radv_device_finish_meta_astc_decode_state(device); fail_etc_decode: -@@ -644,6 +650,7 @@ fail_clear: - void + radv_device_finish_meta_etc_decode_state(device); + fail_fmask_copy: +@@ -639,6 +645,7 @@ void radv_device_finish_meta(struct radv_device *device) { -+ radv_device_finish_meta_rgba_encode_state(device); radv_device_finish_meta_etc_decode_state(device); - radv_device_finish_meta_astc_decode_state(device); ++ radv_device_finish_meta_astc_decode_state(device); radv_device_finish_accel_struct_build_state(device); + radv_device_finish_meta_clear_state(device); + radv_device_finish_meta_resolve_state(device); diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h -index d8ab75481d9..9702bfc9c7b 100644 +index 854cd37766c..d8ab75481d9 100644 --- a/src/amd/vulkan/radv_meta.h +++ b/src/amd/vulkan/radv_meta.h -@@ -107,6 +107,9 @@ void radv_device_finish_meta_etc_decode_state(struct radv_device *device); - VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); - void radv_device_finish_meta_astc_decode_state(struct radv_device *device); +@@ -104,6 +104,9 @@ void radv_device_finish_accel_struct_build_state(struct radv_device *device); + VkResult radv_device_init_meta_etc_decode_state(struct radv_device *device, bool on_demand); + void radv_device_finish_meta_etc_decode_state(struct radv_device *device); -+VkResult radv_device_init_meta_rgba_encode_state(struct radv_device *device); -+void radv_device_finish_meta_rgba_encode_state(struct radv_device *device); ++VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); ++void radv_device_finish_meta_astc_decode_state(struct radv_device *device); + void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer, 
uint32_t flags); -@@ -232,6 +235,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image - void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, - const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); +@@ -226,6 +229,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image + VkImageLayout layout, const VkImageSubresourceLayers *subresource, + VkOffset3D offset, VkExtent3D extent); -+void radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, + const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); + /** * Return whether the bound pipeline is the FMASK decompress pass. */ diff --git a/src/amd/vulkan/radv_meta_astc_decode.c b/src/amd/vulkan/radv_meta_astc_decode.c -index 93a63692c09..006394c639b 100644 ---- a/src/amd/vulkan/radv_meta_astc_decode.c +new file mode 100644 +index 00000000000..93a63692c09 +--- /dev/null +++ b/src/amd/vulkan/radv_meta_astc_decode.c -@@ -166,8 +166,15 @@ radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima - struct radv_image_view src_iview, dst_iview; - image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, - subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); -- image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -- subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ if (image->isNeedHardEncode) { -+ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); -+ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); -+ image_view_init(device, store_image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ } else { -+ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); -+ } - - VkExtent3D extent_copy = { - .width = extent.width, -diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c -index b418d3f2ab9..a0a6b41cce9 100644 ---- a/src/amd/vulkan/radv_meta_copy.c -+++ b/src/amd/vulkan/radv_meta_copy.c -@@ -26,6 +26,7 @@ - #include "main/formats.h" - #include "main/texcompress_etc.h" - #include "main/texcompress_bptc_tmp.h" -+#include "vk_texcompress_bcn.h" - - static VkExtent3D - meta_image_block_size(const struct radv_image *image) -@@ -471,6 +472,74 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf - radv_meta_restore(&saved_state, cmd_buffer); - } - -+void radv_create_staging_images(struct radv_image *dst_image, struct radv_cmd_buffer *cmd_buffer) -+{ -+ struct vk_texcompress_bcn_image *staging_images = -+ vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct vk_texcompress_bcn_image), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); -+ dst_image->staging_images = staging_images; -+ -+ uint32_t size = 0; -+ VkExtent3D extent = (VkExtent3D){dst_image->info.width, 
dst_image->info.height, 1}; -+ VkFormat format = VK_FORMAT_R8G8B8A8_UNORM; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_rgba, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_rgba, dst_image->staging_images->image_rgba); -+ size += _image_rgba->size; -+ -+ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; -+ format = VK_FORMAT_R32G32_UINT; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_bc1, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_bc1, dst_image->staging_images->image_bc1); -+ size += _image_bc1->size; -+ -+ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; -+ format = VK_FORMAT_R32G32_UINT; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_bc4, extent, format, 0); -+ RADV_FROM_HANDLE(radv_image, _image_bc4, dst_image->staging_images->image_bc4); -+ size += _image_bc4->size; -+ -+ extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; -+ format = VK_FORMAT_R8G8B8A8_UNORM; -+ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, -+ &staging_images->image_dummy, extent, format, size); -+ RADV_FROM_HANDLE(radv_image, _image_dummy, dst_image->staging_images->image_dummy); -+ -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_rgba, &staging_images->image_mem, 0); -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc1, &staging_images->image_mem, _image_rgba->size); -+ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc4, &staging_images->image_mem, _image_bc1->size); -+ -+ cmd_buffer->vk.is_need_hard_encode = true; -+ -+ vk_texcompress_insert_head(&cmd_buffer->vk.staging_image_list_head, dst_image->staging_images); -+} -+ -+void radv_copy_buffer_hard_encode(struct radv_image *dst_image, -+ const enum util_format_layout format_layout, -+ struct radv_cmd_buffer *cmd_buffer, -+ const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) -+{ -+ radv_create_staging_images(dst_image, cmd_buffer); -+ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { -+ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } else { -+ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } -+ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -+ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, -+ pCopyBufferToImageInfo->pRegions[r].imageOffset, -+ pCopyBufferToImageInfo->pRegions[r].imageExtent); -+ } -+} +@@ -0,0 +1,183 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * SPDX-License-Identifier: MIT ++ */ + - extern bool - radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); - -@@ -495,6 +564,12 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, - radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); - - const enum 
util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++#include ++#include + -+ if (dst_image->isNeedHardEncode) { -+ radv_copy_buffer_hard_encode(dst_image, format_layout, cmd_buffer, pCopyBufferToImageInfo); -+ return; -+ } -+ - for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { - if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { - radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, -@@ -847,6 +922,36 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, - radv_meta_restore(&saved_state, cmd_buffer); - } - -+void radv_copy_image_hard_encode(struct radv_image *dst_image, -+ const enum util_format_layout format_layout, -+ struct radv_cmd_buffer *cmd_buffer, -+ const VkCopyImageInfo2 *pCopyImageInfo) -+{ -+ radv_create_staging_images(dst_image, cmd_buffer); -+ RADV_FROM_HANDLE(radv_image, src_image, pCopyImageInfo->srcImage); -+ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { -+ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; -+ if (src_image->vk_format != dst_image->vk_format) { -+ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * -+ vk_format_get_blockwidth(dst_image->vk_format); -+ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * -+ vk_format_get_blockheight(dst_image->vk_format); -+ } -+ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { -+ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } else { -+ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } -+ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, -+ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, -+ dst_extent); -+ } -+} -+ - VKAPI_ATTR void VKAPI_CALL - radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyImageInfo) - { -@@ -866,6 +971,12 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI - radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); - - const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; -+ -+ if (dst_image->isNeedHardEncode) { -+ radv_copy_image_hard_encode(dst_image, format_layout, cmd_buffer, pCopyImageInfo); -+ return; -+ } -+ - for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { - VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; - if (src_image->vk_format != dst_image->vk_format) { -diff --git a/src/amd/vulkan/radv_meta_etc_decode.c b/src/amd/vulkan/radv_meta_etc_decode.c -index d5d002c63a0..3685218d915 100644 ---- a/src/amd/vulkan/radv_meta_etc_decode.c -+++ b/src/amd/vulkan/radv_meta_etc_decode.c -@@ -801,23 +801,45 @@ radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag - store_format = VK_FORMAT_R8G8B8A8_UNORM; - } - struct radv_image_view dest_iview; -- radv_image_view_init( -- &dest_iview, cmd_buffer->device, -- &(VkImageViewCreateInfo){ -- .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -- .image = radv_image_to_handle(image), -- .viewType = radv_meta_get_view_type(image), -- .format = store_format, -- .subresourceRange = -- { -- .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, 
-- .baseMipLevel = subresource->mipLevel, -- .levelCount = 1, -- .baseArrayLayer = 0, -- .layerCount = subresource->baseArrayLayer + subresource->layerCount, -- }, -- }, -- NULL); -+ if (image->isNeedHardEncode) { -+ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); -+ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); -+ radv_image_view_init( -+ &dest_iview, cmd_buffer->device, -+ &(VkImageViewCreateInfo){ -+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -+ .image = radv_image_to_handle(store_image), -+ .viewType = radv_meta_get_view_type(store_image), -+ .format = store_format, -+ .subresourceRange = -+ { -+ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, -+ .baseMipLevel = subresource->mipLevel, -+ .levelCount = 1, -+ .baseArrayLayer = 0, -+ .layerCount = subresource->baseArrayLayer + subresource->layerCount, -+ }, -+ }, -+ NULL); -+ } else { -+ radv_image_view_init( -+ &dest_iview, cmd_buffer->device, -+ &(VkImageViewCreateInfo){ -+ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, -+ .image = radv_image_to_handle(image), -+ .viewType = radv_meta_get_view_type(image), -+ .format = store_format, -+ .subresourceRange = -+ { -+ .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, -+ .baseMipLevel = subresource->mipLevel, -+ .levelCount = 1, -+ .baseArrayLayer = 0, -+ .layerCount = subresource->baseArrayLayer + subresource->layerCount, -+ }, -+ }, -+ NULL); -+ } - - decode_etc(cmd_buffer, &src_iview, &dest_iview, &(VkOffset3D){offset.x, offset.y, base_slice}, - &(VkExtent3D){extent.width, extent.height, slice_count}); -diff --git a/src/amd/vulkan/radv_meta_rgba_encode.c b/src/amd/vulkan/radv_meta_rgba_encode.c -new file mode 100644 -index 00000000000..b144767143d ---- /dev/null -+++ b/src/amd/vulkan/radv_meta_rgba_encode.c -@@ -0,0 +1,312 @@ -+/* Copyright (c) 2017-2023 Hans-Kristian Arntzen -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include -+#include -+ -+#include "radv_meta.h" -+#include "sid.h" -+#include "vk_common_entrypoints.h" -+#include "vk_format.h" -+#include "vk_texcompress_bcn.h" -+#include "radv_debug.h" ++#include "radv_meta.h" ++#include "sid.h" ++#include "vk_common_entrypoints.h" ++#include "vk_format.h" + +struct radv_dispatch_info { + /** @@ -482,27 +1369,73 @@ index 00000000000..b144767143d +}; + +VkResult -+radv_device_init_meta_rgba_encode_state(struct radv_device *device) ++radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand) +{ + const struct radv_physical_device *pdev = device->physical_device; + struct radv_meta_state *state = &device->meta_state; + -+ return vk_texcompress_bcn_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), state->bcn_encoded); ++ if (!pdev->emulate_astc) ++ return VK_SUCCESS; ++ ++ return vk_texcompress_astc_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), &state->astc_decode); +} + +void -+radv_device_finish_meta_rgba_encode_state(struct radv_device *device) ++radv_device_finish_meta_astc_decode_state(struct radv_device *device) +{ + struct radv_meta_state *state = &device->meta_state; -+ struct vk_texcompress_bcn_state **bcns = state->bcn_encoded; ++ struct vk_texcompress_astc_state *astc = state->astc_decode; + -+ if (bcns) -+ vk_texcompress_bcn_finish(&device->vk, &state->alloc, state->bcn_encoded); ++ if (astc) ++ vk_texcompress_astc_finish(&device->vk, &state->alloc, astc); +} + +extern void +radv_compute_dispatch(struct radv_cmd_buffer 
*cmd_buffer, const struct radv_dispatch_info *info); + ++static void ++decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *src_iview, struct radv_image_view *dst_iview, ++ VkImageLayout layout, const VkOffset3D *offset, const VkExtent3D *extent) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ struct vk_texcompress_astc_write_descriptor_set write_desc_set; ++ VkFormat format = src_iview->image->vk_format; ++ int blk_w = vk_format_get_blockwidth(format); ++ int blk_h = vk_format_get_blockheight(format); ++ ++ vk_texcompress_astc_fill_write_descriptor_sets(state->astc_decode, &write_desc_set, ++ radv_image_view_to_handle(src_iview), layout, ++ radv_image_view_to_handle(dst_iview), format); ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->astc_decode->p_layout, ++ 0, /* set number */ ++ VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT, write_desc_set.descriptor_set); ++ ++ VkPipeline pipeline = ++ vk_texcompress_astc_get_decode_pipeline(&device->vk, &state->alloc, state->astc_decode, radv_pipeline_cache_to_handle(&state->cache), format); ++ if (pipeline == VK_NULL_HANDLE) ++ return; ++ ++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ ++ bool is_3Dimage = (src_iview->image->type == VK_IMAGE_TYPE_3D) ? true : false; ++ int push_constants[5] = {offset->x / blk_w, offset->y / blk_h, extent->width + offset->x, extent->height + offset->y, ++ is_3Dimage}; ++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), device->meta_state.etc_decode.p_layout, ++ VK_SHADER_STAGE_COMPUTE_BIT, 0, 20, push_constants); ++ ++ struct radv_dispatch_info info = { ++ .blocks[0] = DIV_ROUND_UP(extent->width, blk_w * 2), ++ .blocks[1] = DIV_ROUND_UP(extent->height, blk_h * 2), ++ .blocks[2] = extent->depth, ++ .offsets[0] = 0, ++ .offsets[1] = 0, ++ .offsets[2] = offset->z, ++ .unaligned = 0, ++ }; ++ radv_compute_dispatch(cmd_buffer, &info); ++} ++ +static VkImageViewType +get_view_type(const struct radv_image *image) +{ @@ -518,8 +1451,7 @@ index 00000000000..b144767143d + +static void +image_view_init(struct radv_device *device, struct radv_image *image, VkFormat format, VkImageAspectFlags aspectMask, -+ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, VkComponentMapping *component, -+ struct radv_image_view *iview) ++ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, struct radv_image_view *iview) +{ + VkImageViewCreateInfo iview_create_info = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, @@ -535,461 +1467,3507 @@ index 00000000000..b144767143d + .layerCount = baseArrayLayer + layerCount, + }, + }; -+ if (component) { -+ iview_create_info.components = *component; -+ } + + radv_image_view_init(iview, device, &iview_create_info, NULL); +} + -+static void -+encode_bc1(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *src_image, struct radv_image *dst_image) ++void ++radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) +{ + struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; ++ struct radv_meta_saved_state saved_state; ++ radv_meta_save(&saved_state, cmd_buffer, ++ RADV_META_SAVE_COMPUTE_PIPELINE | 
RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); + -+ struct radv_image_view src_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_R, -+ .g = VK_COMPONENT_SWIZZLE_G, -+ .b = VK_COMPONENT_SWIZZLE_B, -+ .a = VK_COMPONENT_SWIZZLE_A -+ }; -+ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &src_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; -+ vk_texcompress_bc1_fill_write_descriptor_sets(state->bcn_encoded[BC1_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&src_iview), layout, -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC1_ENCODE]->p_layout, -+ 0, /* set number */ -+ 3, write_desc_set.descriptor_set); ++ VkImage _image = radv_image_to_handle(image); ++ VK_FROM_HANDLE(vk_image, vkImage, _image); + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC1_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC1_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ uint32_t base_slice = radv_meta_get_iview_layer(image, subresource, &offset); ++ uint32_t slice_count = image->type == VK_IMAGE_TYPE_3D ++ ? 
extent.depth ++ : vk_image_subresource_layer_count(vkImage, subresource); + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ extent = radv_sanitize_image_extent(image->type, extent); ++ offset = radv_sanitize_image_offset(image->type, offset); + -+ const unsigned num_refinements = 1; -+ const unsigned push_constants[2] = {num_refinements}; -+ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC1_ENCODE]->p_layout, -+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); ++ struct radv_image_view src_iview, dst_iview; ++ image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); ++ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); + -+ struct radv_dispatch_info info = { -+ .blocks[0] = DIV_ROUND_UP(extent->width, 32), -+ .blocks[1] = DIV_ROUND_UP(extent->height, 32), -+ .blocks[2] = 1, ++ VkExtent3D extent_copy = { ++ .width = extent.width, ++ .height = extent.height, ++ .depth = slice_count, + }; -+ radv_compute_dispatch(cmd_buffer, &info); ++ decode_astc(cmd_buffer, &src_iview, &dst_iview, layout, &(VkOffset3D){offset.x, offset.y, base_slice}, &extent_copy); ++ + radv_image_view_finish(&src_iview); + radv_image_view_finish(&dst_iview); -+} + ++ radv_meta_restore(&saved_state, cmd_buffer); ++} +diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c +index bf2d7fcee33..f1a98179b05 100644 +--- a/src/amd/vulkan/radv_meta_bufimage.c ++++ b/src/amd/vulkan/radv_meta_bufimage.c +@@ -1324,6 +1324,92 @@ create_bview_for_r32g32b32(struct radv_cmd_buffer *cmd_buffer, struct radv_buffe + }); + } + ++/* GFX9+ has an issue where the HW does not calculate mipmap degradations ++ * for block-compressed images correctly (see the comment in ++ * radv_image_view_init). Some texels are unaddressable and cannot be copied ++ * to/from by a compute shader. Here we will perform a buffer copy to copy the ++ * texels that the hardware missed. ++ * ++ * GFX10 will not use this workaround because it can be fixed by adjusting its ++ * image view descriptors instead. 
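++ *
++ * Illustrative example (added note, not from the original comment): for
++ * a 36x36 image with 4x4 blocks, mip 2 is 9 texels wide, i.e.
++ * ceil(9/4) = 3 blocks, but the HW minifies the 9-block base extent to
++ * 9 >> 2 = 2 blocks; that third block column is invisible to the compute
++ * shader copy and is exactly what the manual buffer copy here transfers.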
++ */ +static void -+encode_bc4(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *src_image, struct radv_image *dst_image) ++fixup_gfx9_cs_copy(struct radv_cmd_buffer *cmd_buffer, ++ const struct radv_meta_blit2d_buffer *buf_bsurf, ++ const struct radv_meta_blit2d_surf *img_bsurf, ++ const struct radv_meta_blit2d_rect *rect, bool to_image) +{ -+ struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; ++ const unsigned mip_level = img_bsurf->level; ++ const struct radv_image *image = img_bsurf->image; ++ const struct radeon_surf *surf = &image->planes[0].surface; ++ const struct radv_device *device = cmd_buffer->device; ++ const struct radeon_info *rad_info = &device->physical_device->rad_info; ++ struct ac_addrlib *addrlib = device->ws->get_addrlib(device->ws); ++ ++ /* GFX10 will use a different workaround unless this is not a 2D image */ ++ if (rad_info->chip_class < GFX9 || ++ (rad_info->chip_class >= GFX10 && image->type == VK_IMAGE_TYPE_2D) || ++ image->info.levels == 1 || !vk_format_is_block_compressed(image->vk_format)) ++ return; + -+ struct radv_image_view src_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_A, -+ .g = VK_COMPONENT_SWIZZLE_ZERO, -+ .b = VK_COMPONENT_SWIZZLE_ZERO, -+ .a = VK_COMPONENT_SWIZZLE_ONE -+ }; -+ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &src_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bc4_write_descriptor_set write_desc_set; -+ vk_texcompress_bc4_fill_write_descriptor_sets(state->bcn_encoded[BC4_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&src_iview), layout, -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC4_ENCODE]->p_layout, -+ 0, /* set number */ -+ 2, write_desc_set.descriptor_set); ++ /* The physical extent of the base mip */ ++ VkExtent2D hw_base_extent = {surf->u.gfx9.base_mip_width, surf->u.gfx9.base_mip_height}; + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC4_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC4_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ /* The hardware-calculated extent of the selected mip ++ * (naive divide-by-two integer math) ++ */ ++ VkExtent2D hw_mip_extent = {radv_minify(hw_base_extent.width, mip_level), ++ radv_minify(hw_base_extent.height, mip_level)}; + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ /* The actual extent we want to copy */ ++ VkExtent2D mip_extent = {rect->width, rect->height}; + -+ const unsigned alpha_channel_id = 0; // r通道 -+ const unsigned use_norm = 0; -+ const unsigned push_constants[2] = {alpha_channel_id, use_norm}; -+ 
radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC4_ENCODE]->p_layout, -+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants); ++ VkOffset2D mip_offset = {to_image ? rect->dst_x : rect->src_x, ++ to_image ? rect->dst_y : rect->src_y}; + -+ struct radv_dispatch_info info = { -+ .blocks[0] = 1, -+ .blocks[1] = DIV_ROUND_UP(extent->width, 16), -+ .blocks[2] = DIV_ROUND_UP(extent->height, 16), -+ }; -+ radv_compute_dispatch(cmd_buffer, &info); -+ radv_image_view_finish(&src_iview); -+ radv_image_view_finish(&dst_iview); ++ if (hw_mip_extent.width >= mip_offset.x + mip_extent.width && ++ hw_mip_extent.height >= mip_offset.y + mip_extent.height) ++ return; ++ ++ if (!to_image) { ++ /* If we are writing to a buffer, then we need to wait for the compute ++ * shader to finish because it may write over the unaddressable texels ++ * while we're fixing them. If we're writing to an image, we do not need ++ * to wait because the compute shader cannot write to those texels ++ */ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_L2 | RADV_CMD_FLAG_INV_VCACHE; ++ } ++ ++ for (uint32_t y = 0; y < mip_extent.width; y++) { ++ uint32_t coordY = y + mip_offset.y; ++ /* If the default copy algorithm (done previously) has already seen this ++ * scanline, then we can bias the starting X coordinate over to skip the ++ * region already copied by the default copy. ++ */ ++ uint32_t x = (coordY < hw_mip_extent.height) ? hw_mip_extent.width : 0; ++ for (; x < mip_extent.width; x++) { ++ uint32_t coordX = x + mip_offset.x; ++ uint64_t addr = ac_surface_addr_from_coord(addrlib, rad_info, surf, &image->info, ++ mip_level, coordX, coordY, img_bsurf->layer, ++ image->type == VK_IMAGE_TYPE_3D); ++ struct radeon_winsys_bo *img_bo = image->bo; ++ struct radeon_winsys_bo *mem_bo = buf_bsurf->buffer->bo; ++ const uint64_t img_offset = image->offset + addr; ++ /* buf_bsurf->offset already includes the layer offset */ ++ const uint64_t mem_offset = buf_bsurf->buffer->offset + ++ buf_bsurf->offset + ++ y * buf_bsurf->pitch * surf->bpe + ++ x * surf->bpe; ++ if (to_image) { ++ radv_copy_buffer(cmd_buffer, mem_bo, img_bo, mem_offset, img_offset, surf->bpe); ++ } else { ++ radv_copy_buffer(cmd_buffer, img_bo, mem_bo, img_offset, mem_offset, surf->bpe); ++ } ++ } ++ } +} + -+static void -+encode_bc3(struct radv_cmd_buffer *cmd_buffer, -+ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, -+ struct radv_image *bc1_image, struct radv_image *bc4_image, struct radv_image *dst_image) -+{ -+ struct radv_device *device = cmd_buffer->device; -+ struct radv_meta_state *state = &device->meta_state; + static unsigned + get_image_stride_for_r32g32b32(struct radv_cmd_buffer *cmd_buffer, + struct radv_meta_blit2d_surf *surf) +@@ -1399,6 +1485,7 @@ radv_meta_image_to_buffer(struct radv_cmd_buffer *cmd_buffer, struct radv_meta_b + 16, push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); ++ fixup_gfx9_cs_copy(cmd_buffer, dst, src, &rects[r], false); + } + + radv_image_view_finish(&src_view); +@@ -1553,6 +1640,7 @@ radv_meta_buffer_to_image_cs(struct radv_cmd_buffer *cmd_buffer, + 16, push_constants); + + radv_unaligned_dispatch(cmd_buffer, rects[r].width, rects[r].height, 1); ++ fixup_gfx9_cs_copy(cmd_buffer, src, dst, &rects[r], true); + } + + radv_image_view_finish(&dst_view); +diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c +index 25368e64a04..b418d3f2ab9 100644 +--- 
a/src/amd/vulkan/radv_meta_copy.c ++++ b/src/amd/vulkan/radv_meta_copy.c +@@ -471,6 +471,9 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf + radv_meta_restore(&saved_state, cmd_buffer); + } + ++extern bool ++radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + -+ struct radv_image_view bc1_iview, bc4_iview, dst_iview; -+ VkComponentMapping component = { -+ .r = VK_COMPONENT_SWIZZLE_R, -+ .g = VK_COMPONENT_SWIZZLE_G, -+ .b = VK_COMPONENT_SWIZZLE_ZERO, -+ .a = VK_COMPONENT_SWIZZLE_ONE -+ }; -+ image_view_init(device, bc1_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &bc1_iview); -+ image_view_init(device, bc4_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, &component, &bc4_iview); -+ image_view_init(device, dst_image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, -+ subresource->baseArrayLayer, 1, NULL, &dst_iview); -+ -+ cmd_buffer->state.flush_bits |= -+ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc1_iview.image) | -+ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc4_iview.image) | -+ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); -+ -+ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; -+ vk_texcompress_bc3_fill_write_descriptor_sets(state->bcn_encoded[BC3_ENCODE], &write_desc_set, -+ radv_image_view_to_handle(&bc1_iview), radv_image_view_to_handle(&bc4_iview), -+ radv_image_view_to_handle(&dst_iview)); -+ -+ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC3_ENCODE]->p_layout, -+ 0, /* set number */ -+ 3, write_desc_set.descriptor_set); + VKAPI_ATTR void VKAPI_CALL + radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) +@@ -485,18 +488,25 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + &pCopyBufferToImageInfo->pRegions[r]); + } + +- if (cmd_buffer->device->physical_device->emulate_etc2 && +- vk_format_description(dst_image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(cmd_buffer->device->physical_device, dst_image->vk_format)) { + cmd_buffer->state.flush_bits |= + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, dst_image) | +- radv_dst_access_flush( +- cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ ++ const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { +- radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, +- &pCopyBufferToImageInfo->pRegions[r].imageSubresource, +- pCopyBufferToImageInfo->pRegions[r].imageOffset, +- pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ 
pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } + } + } + } +@@ -849,18 +859,30 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI + pCopyImageInfo->dstImageLayout, &pCopyImageInfo->pRegions[r]); + } + +- if (cmd_buffer->device->physical_device->emulate_etc2 && +- vk_format_description(dst_image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ETC) { ++ if (radv_is_format_emulated(cmd_buffer->device->physical_device, dst_image->vk_format)) { + cmd_buffer->state.flush_bits |= + RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | + radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, dst_image) | +- radv_dst_access_flush( +- cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); ++ ++ const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { +- radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, +- &pCopyImageInfo->pRegions[r].dstSubresource, +- pCopyImageInfo->pRegions[r].dstOffset, +- pCopyImageInfo->pRegions[r].extent); ++ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; ++ if (src_image->vk_format != dst_image->vk_format) { ++ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * ++ vk_format_get_blockwidth(dst_image->vk_format); ++ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * ++ vk_format_get_blockheight(dst_image->vk_format); ++ } ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } + } + } + } +diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h +index 950ded15f58..472858dd401 100644 +--- a/src/amd/vulkan/radv_private.h ++++ b/src/amd/vulkan/radv_private.h +@@ -96,6 +96,7 @@ typedef uint32_t xcb_window_t; + + #include "radv_entrypoints.h" + ++#include "vk_texcompress_astc.h" + #include "wsi_common.h" + + #ifdef __cplusplus +@@ -303,6 +304,9 @@ struct radv_physical_device { + /* Whether to emulate ETC2 image support on HW without support. */ + bool emulate_etc2; + ++ /* Whether to emulate ASTC image support on HW without support. */ ++ bool emulate_astc; ++ + /* This is the drivers on-disk cache used as a fallback as opposed to + * the pipeline cache defined by apps. 
+ */ +@@ -675,6 +679,8 @@ struct radv_meta_state { + VkPipelineLayout p_layout; + VkPipeline pipeline; + } etc_decode; + -+ VkPipeline pipeline = -+ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, -+ state->bcn_encoded[BC3_ENCODE], -+ radv_pipeline_cache_to_handle(&state->cache), BC3_ENCODE); -+ if (pipeline == VK_NULL_HANDLE) -+ return; ++ struct vk_texcompress_astc_state *astc_decode; + }; + + #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1) +@@ -2415,6 +2421,9 @@ struct radv_image_view { + * This has a few differences for cube maps (e.g. type). + */ + union radv_descriptor storage_descriptor; + -+ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ /* Block-compressed image views on GFX10+. */ ++ struct ac_surf_nbc_view nbc_view; + }; + + struct radv_image_create_info { +diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h +index d72b4d670d8..83a2785d4bd 100644 +--- a/src/amd/vulkan/radv_radeon_winsys.h ++++ b/src/amd/vulkan/radv_radeon_winsys.h +@@ -295,6 +295,8 @@ struct radeon_winsys { + + int (*get_fd)(struct radeon_winsys *ws); + ++ struct ac_addrlib *(*get_addrlib)(struct radeon_winsys *ws); + -+ struct radv_dispatch_info info = { -+ .blocks[0] = DIV_ROUND_UP(extent->width, 32), -+ .blocks[1] = DIV_ROUND_UP(extent->height, 32), -+ .blocks[2] = 1, -+ }; -+ radv_compute_dispatch(cmd_buffer, &info); -+ radv_image_view_finish(&bc1_iview); -+ radv_image_view_finish(&bc4_iview); -+ radv_image_view_finish(&dst_iview); + const struct vk_sync_type *const *(*get_sync_types)(struct radeon_winsys *ws); + }; + +diff --git a/src/amd/vulkan/vk_format.h b/src/amd/vulkan/vk_format.h +index 3cb035eb3cc..e6367784002 100644 +--- a/src/amd/vulkan/vk_format.h ++++ b/src/amd/vulkan/vk_format.h +@@ -195,4 +195,29 @@ vk_format_get_plane_format(VkFormat format, unsigned plane_id) + } + } + ++static inline VkFormat ++vk_texcompress_etc2_emulation_format(VkFormat etc2_format) ++{ ++ switch (etc2_format) { ++ case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK: ++ return VK_FORMAT_R8G8B8A8_UNORM; ++ case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK: ++ case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK: ++ return VK_FORMAT_R8G8B8A8_SRGB; ++ case VK_FORMAT_EAC_R11_UNORM_BLOCK: ++ return VK_FORMAT_R16_UNORM; ++ case VK_FORMAT_EAC_R11_SNORM_BLOCK: ++ return VK_FORMAT_R16_SNORM; ++ case VK_FORMAT_EAC_R11G11_UNORM_BLOCK: ++ return VK_FORMAT_R16G16_UNORM; ++ case VK_FORMAT_EAC_R11G11_SNORM_BLOCK: ++ return VK_FORMAT_R16G16_SNORM; ++ default: ++ return VK_FORMAT_UNDEFINED; ++ } +} + -+void -+radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, -+ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) + #endif /* VK_FORMAT_H */ +diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c +index 9d160b65ed4..8947ee1dc50 100644 +--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c ++++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_surface.c +@@ -95,8 +95,17 @@ radv_amdgpu_winsys_surface_init(struct radeon_winsys *_ws, const struct ac_surf_ + return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf); + } + ++ ++static struct ac_addrlib * ++radv_amdgpu_get_addrlib(struct radeon_winsys *rws) +{ -+ struct radv_device *device = cmd_buffer->device; -+ struct 
radv_meta_saved_state saved_state; -+ radv_meta_save(&saved_state, cmd_buffer, -+ RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); ++ struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(rws); ++ return ws->addrlib; ++} + -+ VkImage _image = radv_image_to_handle(image); -+ VK_FROM_HANDLE(vk_image, vkImage, _image); + void + radv_amdgpu_surface_init_functions(struct radv_amdgpu_winsys *ws) + { ++ ws->base.get_addrlib = radv_amdgpu_get_addrlib; + ws->base.surface_init = radv_amdgpu_winsys_surface_init; + } +diff --git a/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl +new file mode 100644 +index 00000000000..7ef940a1603 +--- /dev/null ++++ b/src/compiler/glsl/CrossPlatformSettings_piece_all.glsl +@@ -0,0 +1,98 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ + -+ extent = radv_sanitize_image_extent(image->type, extent); -+ offset = radv_sanitize_image_offset(image->type, offset); + -+ VkExtent3D extent_copy = { -+ .width = extent.width, -+ .height = extent.height, -+ .depth = 1, -+ }; ++#define min3( a, b, c ) min( a, min( b, c ) ) ++#define max3( a, b, c ) max( a, max( b, c ) ) ++ ++#define float2 vec2 ++#define float3 vec3 ++#define float4 vec4 ++ ++#define int2 ivec2 ++#define int3 ivec3 ++#define int4 ivec4 ++ ++#define uint2 uvec2 ++#define uint3 uvec3 ++#define uint4 uvec4 ++ ++#define float2x2 mat2 ++#define float3x3 mat3 ++#define float4x4 mat4 ++#define ogre_float4x3 mat3x4 ++ ++#define ushort uint ++#define ushort3 uint3 ++#define ushort4 uint4 ++ ++//Short used for read operations. It's an int in GLSL & HLSL. An ushort in Metal ++#define rshort int ++#define rshort2 int2 ++#define rint int ++//Short used for write operations. It's an int in GLSL. 
An ushort in HLSL & Metal ++#define wshort2 int2 ++#define wshort3 int3 ++ ++#define toFloat3x3( x ) mat3( x ) ++#define buildFloat3x3( row0, row1, row2 ) mat3( row0, row1, row2 ) ++ ++#define mul( x, y ) ((x) * (y)) ++#define saturate(x) clamp( (x), 0.0, 1.0 ) ++#define lerp mix ++#define rsqrt inversesqrt ++#define INLINE ++#define NO_INTERPOLATION_PREFIX flat ++#define NO_INTERPOLATION_SUFFIX ++ ++#define PARAMS_ARG_DECL ++#define PARAMS_ARG ++ ++#define reversebits bitfieldReverse ++ ++#define OGRE_Sample( tex, sampler, uv ) texture( tex, uv ) ++#define OGRE_SampleLevel( tex, sampler, uv, lod ) textureLod( tex, uv, lod ) ++#define OGRE_SampleArray2D( tex, sampler, uv, arrayIdx ) texture( tex, vec3( uv, arrayIdx ) ) ++#define OGRE_SampleArray2DLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec3( uv, arrayIdx ), lod ) ++#define OGRE_SampleArrayCubeLevel( tex, sampler, uv, arrayIdx, lod ) textureLod( tex, vec4( uv, arrayIdx ), lod ) ++#define OGRE_SampleGrad( tex, sampler, uv, ddx, ddy ) textureGrad( tex, uv, ddx, ddy ) ++#define OGRE_SampleArray2DGrad( tex, sampler, uv, arrayIdx, ddx, ddy ) textureGrad( tex, vec3( uv, arrayIdx ), ddx, ddy ) ++#define OGRE_ddx( val ) dFdx( val ) ++#define OGRE_ddy( val ) dFdy( val ) ++#define OGRE_Load2D( tex, iuv, lod ) texelFetch( tex, iuv, lod ) ++#define OGRE_LoadArray2D( tex, iuv, arrayIdx, lod ) texelFetch( tex, ivec3( iuv, arrayIdx ), lod ) ++#define OGRE_Load2DMS( tex, iuv, subsample ) texelFetch( tex, iuv, subsample ) ++ ++#define OGRE_Load3D( tex, iuv, lod ) texelFetch( tex, ivec3( iuv ), lod ) ++ ++#define OGRE_GatherRed( tex, sampler, uv ) textureGather( tex, uv, 0 ) ++#define OGRE_GatherGreen( tex, sampler, uv ) textureGather( tex, uv, 1 ) ++#define OGRE_GatherBlue( tex, sampler, uv ) textureGather( tex, uv, 2 ) ++ ++#define bufferFetch1( buffer, idx ) texelFetch( buffer, idx ).x ++ ++#define OGRE_SAMPLER_ARG_DECL( samplerName ) ++#define OGRE_SAMPLER_ARG( samplerName ) ++ ++#define OGRE_Texture3D_float4 sampler3D ++#define OGRE_OUT_REF( declType, variableName ) out declType variableName ++#define OGRE_INOUT_REF( declType, variableName ) inout declType variableName +diff --git a/src/compiler/glsl/astc_decoder.glsl b/src/compiler/glsl/astc_decoder.glsl +new file mode 100644 +index 00000000000..ec00cc5ab41 +--- /dev/null ++++ b/src/compiler/glsl/astc_decoder.glsl +@@ -0,0 +1,1382 @@ ++#version 320 es ++precision highp float; ++precision highp int; ++precision highp usamplerBuffer; ++precision highp usampler2D; ++precision highp image2D; ++precision highp uimage2D; ++ ++/* Copyright (c) 2020-2022 Hans-Kristian Arntzen ++ * Copyright (c) 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#ifdef VULKAN ++ ++precision highp utextureBuffer; ++precision highp utexture2DArray; ++precision highp uimage2DArray; ++precision highp uimage3D; ++precision highp utexture3D; ++ ++#extension GL_EXT_samplerless_texture_functions : require ++layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 4) in; ++ ++layout(set = 0, binding = 0) writeonly uniform uimage2DArray OutputImage2Darray; ++layout(set = 0, binding = 0) writeonly uniform uimage3D OutputImage3D; ++layout(set = 0, binding = 1) uniform utexture2DArray PayloadInput2Darray; ++layout(set = 0, binding = 1) uniform utexture3D PayloadInput3D; ++layout(set = 0, binding = 2) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer; ++layout(set = 0, binding = 3) uniform utextureBuffer LUTEndpointUnquantize; ++layout(set = 0, binding = 4) uniform utextureBuffer LUTWeightQuantizer; ++layout(set = 0, binding = 5) uniform utextureBuffer LUTWeightUnquantize; ++layout(set = 0, binding = 6) uniform utextureBuffer LUTTritQuintDecode; ++layout(set = 0, binding = 7) uniform utextureBuffer LUTPartitionTable; ++ ++layout(constant_id = 2) const bool DECODE_8BIT = false; ++ ++layout(push_constant, std430) uniform pc { ++ ivec2 texel_blk_start; ++ ivec2 texel_end; ++ bool is_3Dimage; ++}; + -+ RADV_FROM_HANDLE(radv_image, rgba_image, image->staging_images->image_rgba); -+ RADV_FROM_HANDLE(radv_image, image1, image->staging_images->image_bc1); -+ RADV_FROM_HANDLE(radv_image, image4, image->staging_images->image_bc4); ++#else /* VULKAN */ + -+ encode_bc1(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image1); ++layout(local_size_x = %u, local_size_y = %u, local_size_z = 4) in; ++ ++#define utextureBuffer usamplerBuffer ++#define utexture2D usampler2D ++ ++layout(binding = 0) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer; ++layout(binding = 1) uniform utextureBuffer LUTEndpointUnquantize; ++layout(binding = 2) uniform utextureBuffer LUTWeightQuantizer; ++layout(binding = 3) uniform utextureBuffer LUTWeightUnquantize; ++layout(binding = 4) uniform utextureBuffer LUTTritQuintDecode; ++layout(binding = 5) uniform utexture2D LUTPartitionTable; ++layout(binding = 6) uniform utexture2D PayloadInput; ++ ++layout(rgba8ui, binding = 7) writeonly uniform uimage2D OutputImage; ++const bool DECODE_8BIT = true; ++ ++#endif /* VULKAN */ ++ ++const int MODE_LDR = 0; ++const int MODE_HDR = 1; ++const int MODE_HDR_LDR_ALPHA = 2; ++ ++const uvec4 error_color = uvec4(255, 0, 255, 255); ++ ++/* bitextract.h */ ++int extract_bits(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits <= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = int(bitfieldExtract(payload[offset >> 5], offset & 31, bits)); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ int result_first = int(bitfieldExtract(payload[offset >> 5], offset & 31, first_bits)); ++ int result_second = int(bitfieldExtract(payload[(offset >> 5) + 1], 0, bits - first_bits)); ++ result = result_first | (result_second << first_bits); ++ } ++ return result; ++} ++ ++/* bitextract.h */ ++int extract_bits_sign(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits 
<= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = bitfieldExtract(int(payload[offset >> 5]), offset & 31, bits); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ int result_first = int(bitfieldExtract(payload[offset >> 5], offset & 31, first_bits)); ++ int result_second = bitfieldExtract(int(payload[(offset >> 5) + 1]), 0, bits - first_bits); ++ result = result_first | (result_second << first_bits); ++ } ++ return result; ++} ++ ++/* bitextract.h */ ++int extract_bits_reverse(uvec4 payload, int offset, int bits) ++{ ++ int last_offset = offset + bits - 1; ++ int result; ++ ++ if (bits <= 0) ++ result = 0; ++ else if ((last_offset >> 5) == (offset >> 5)) ++ result = int(bitfieldReverse(bitfieldExtract(payload[offset >> 5], offset & 31, bits)) >> (32 - bits)); ++ else ++ { ++ int first_bits = 32 - (offset & 31); ++ uint result_first = bitfieldExtract(payload[offset >> 5], offset & 31, first_bits); ++ uint result_second = bitfieldExtract(payload[(offset >> 5) + 1], 0, bits - first_bits); ++ result = int(bitfieldReverse(result_first | (result_second << first_bits)) >> (32 - bits)); ++ } ++ return result; ++} ++ ++void swap(inout int a, inout int b) ++{ ++ int tmp = a; ++ a = b; ++ b = tmp; ++} ++ ++ivec4 build_coord() ++{ ++ ivec2 payload_coord = ivec2(gl_WorkGroupID.xy) * 2; ++ payload_coord.x += int(gl_LocalInvocationID.z) & 1; ++ payload_coord.y += (int(gl_LocalInvocationID.z) >> 1) & 1; ++#ifdef VULKAN ++ payload_coord += texel_blk_start; ++#endif /* VULKAN */ ++ ivec2 coord = payload_coord * ivec2(gl_WorkGroupSize.xy); ++ coord += ivec2(gl_LocalInvocationID.xy); ++ return ivec4(coord, payload_coord); ++} ++ ++ivec4 interpolate_endpoint(ivec4 ep0, ivec4 ep1, ivec4 weight, int decode_mode) ++{ ++ if (decode_mode == MODE_HDR) ++ { ++ ep0 <<= 4; ++ ep1 <<= 4; ++ } ++ else if (decode_mode == MODE_HDR_LDR_ALPHA) ++ { ++ ep0.rgb <<= 4; ++ ep1.rgb <<= 4; ++ ep0.a *= 0x101; ++ ep1.a *= 0x101; ++ } ++ else if (DECODE_8BIT) ++ { ++ // This isn't quite right in all cases. ++ // In normal ASTC with sRGB, the alpha channel is supposed to ++ // be decoded as FP16, ++ // even when color components are SRGB 8-bit (?!?!?!?!). ++ // This is correct if decode_unorm8 mode is used though, ++ // for sanity, we're going to assume unorm8 decoding mode ++ // is implied when using sRGB. ++ ep0 = (ep0 << 8) | ivec4(0x80); ++ ep1 = (ep1 << 8) | ivec4(0x80); ++ } ++ else ++ { ++ ep0 *= 0x101; ++ ep1 *= 0x101; ++ } ++ ++ ivec4 color = (ep0 * (64 - weight) + ep1 * weight + 32) >> 6; ++ return color; ++} ++ ++bvec4 bvec_or(bvec4 a, bvec4 b) ++{ ++ return bvec4(ivec4(a) | ivec4(b)); ++} ++ ++uint round_down_quantize_fp16(int color) ++{ ++ // ASTC has a very peculiar way of converting the decoded result to FP16. ++ // 0xffff -> 1.0, and for everything else we get roundDownQuantizeFP16(vec4(c) / vec4(0x10000)). ++ int msb = findMSB(color); ++ int shamt = msb; ++ int m = ((color << 10) >> shamt) & 0x3ff; ++ int e = msb - 1; ++ uint decoded = color == 0xffff ? 0x3c00u : uint(e < 1 ? (color << 8) : (m | (e << 10))); ++ return decoded; ++} ++ ++uvec4 round_down_quantize_fp16(ivec4 color) ++{ ++ // ASTC has a very peculiar way of converting the decoded result to FP16. ++ // 0xffff -> 1.0, and for everything else we get roundDownQuantizeFP16(vec4(c) / vec4(0x10000)). 
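++ // Worked example (added note, not part of the upstream shader): for
++ // color = 0x8000 we get msb = 15, m = ((0x8000 << 10) >> 15) & 0x3ff = 0
++ // and e = 14, so decoded = 14 << 10 = 0x3800, which is FP16 0.5 and
++ // matches 0x8000 / 0x10000 exactly. The mixes below then handle the
++ // e < 1 denormal fallback (color << 8) and the 0xffff -> 0x3c00 (1.0)
++ // special case.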
++ ivec4 msb = findMSB(color); ++ ivec4 shamt = msb; ++ ivec4 m = ((color << 10) >> shamt) & 0x3ff; ++ ivec4 e = msb - 1; ++ uvec4 decoded = uvec4(m | (e << 10)); ++ uvec4 denorm_decode = uvec4(color << 8); ++ decoded = mix(decoded, uvec4(denorm_decode), lessThan(e, ivec4(1))); ++ decoded = mix(decoded, uvec4(0x3c00), equal(color, ivec4(0xffff))); ++ return decoded; ++} ++ ++uvec4 decode_fp16(ivec4 color, int decode_mode) ++{ ++ if (decode_mode != MODE_LDR) ++ { ++ // Interpret the value as FP16, but with some extra fixups along the way to make the interpolation more ++ // logarithmic (apparently). From spec: ++ ivec4 e = color >> 11; ++ ivec4 m = color & 0x7ff; ++ ivec4 mt = 4 * m - 512; ++ mt = mix(mt, ivec4(3 * m), lessThan(m, ivec4(512))); ++ mt = mix(mt, ivec4(5 * m - 2048), greaterThanEqual(m, ivec4(1536))); ++ ++ ivec4 decoded = (e << 10) + (mt >> 3); ++ // +Inf or NaN are decoded to 0x7bff (max finite value). ++ decoded = mix(decoded, ivec4(0x7bff), bvec_or(greaterThan(decoded & 0x7fff, ivec4(0x7c00)), equal(decoded, ivec4(0x7c00)))); ++ ++ if (decode_mode == MODE_HDR_LDR_ALPHA) ++ decoded.a = int(round_down_quantize_fp16(color.a)); ++ ++ return uvec4(decoded); ++ } ++ else ++ { ++ return round_down_quantize_fp16(color); ++ } ++} ++ ++struct BlockMode ++{ ++ ivec2 weight_grid_size; ++ int weight_mode_index; ++ int num_partitions; ++ int seed; ++ int cem; ++ int config_bits; ++ int primary_config_bits; ++ bool dual_plane; ++ bool void_extent; ++}; ++ ++bool decode_error = false; ++ ++BlockMode decode_block_mode(uvec4 payload) ++{ ++ BlockMode mode; ++ mode.void_extent = (payload.x & 0x1ffu) == 0x1fcu; ++ if (mode.void_extent) ++ return mode; ++ ++ mode.dual_plane = (payload.x & (1u << 10u)) != 0u; ++ ++ uint higher = (payload.x >> 2u) & 3u; ++ uint lower = payload.x & 3u; ++ ++ if (lower != 0u) ++ { ++ mode.weight_mode_index = int((payload.x >> 4u) & 1u); ++ mode.weight_mode_index |= int((payload.x << 1u) & 6u); ++ mode.weight_mode_index |= int((payload.x >> 6u) & 8u); ++ ++ if (higher < 2u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 2) + 4u + 4u * higher); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ } ++ else if (higher == 2u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 2) + 8u); ++ } ++ else ++ { ++ if ((payload.x & (1u << 8u)) != 0u) ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 7, 1) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ } ++ else ++ { ++ mode.weight_grid_size.x = int(bitfieldExtract(payload.x, 5, 2) + 2u); ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 7, 1) + 6u); ++ } ++ } ++ } ++ else ++ { ++ int p3 = int(bitfieldExtract(payload.x, 9, 1)); ++ int hi = int(bitfieldExtract(payload.x, 7, 2)); ++ int lo = int(bitfieldExtract(payload.x, 5, 2)); ++ if (hi == 0) ++ { ++ mode.weight_grid_size.x = 12; ++ mode.weight_grid_size.y = lo + 2; ++ } ++ else if (hi == 1) ++ { ++ mode.weight_grid_size.x = lo + 2; ++ mode.weight_grid_size.y = 12; ++ } ++ else if (hi == 2) ++ { ++ mode.dual_plane = false; ++ p3 = 0; ++ mode.weight_grid_size.x = lo + 6; ++ mode.weight_grid_size.y = int(bitfieldExtract(payload.x, 9, 2) + 6u); ++ } ++ else ++ { ++ if (lo == 0) ++ mode.weight_grid_size = ivec2(6, 10); ++ else if (lo == 1) ++ mode.weight_grid_size = ivec2(10, 6); ++ else ++ decode_error = true; ++ } ++ ++ int p0 = int(bitfieldExtract(payload.x, 4, 1)); ++ int p1 = 
int(bitfieldExtract(payload.x, 2, 1)); ++ int p2 = int(bitfieldExtract(payload.x, 3, 1)); ++ mode.weight_mode_index = p0 + (p1 << 1) + (p2 << 2) + (p3 << 3); ++ } ++ ++ // 11 bits for block mode. ++ // 2 bits for partition select ++ // If partitions > 1: ++ // 4 bits CEM selector ++ // If dual_plane: ++ // 2 bits of CCS ++ // else: ++ // 10 for partition seed ++ // 2 bits for CEM main selector ++ // If CEM[1:0] = 00: ++ // 4 bits for CEM extra selector if all same type. ++ // else: ++ // (1 + 2) * num_partitions if different types. ++ // First 4 bits are encoded next to CEM[1:0], otherwise, packed before weights. ++ // If dual_plane: ++ // 2 bits of CCS before extra CEM bits. ++ const int CONFIG_BITS_BLOCK = 11; ++ const int CONFIG_BITS_PARTITION_MODE = 2; ++ const int CONFIG_BITS_SEED = 10; ++ const int CONFIG_BITS_PRIMARY_MULTI_CEM = 2; ++ const int CONFIG_BITS_CEM = 4; ++ const int CONFIG_BITS_EXTRA_CEM_PER_PARTITION = 3; ++ const int CONFIG_BITS_CCS = 2; ++ ++ mode.num_partitions = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK, CONFIG_BITS_PARTITION_MODE)) + 1; ++ ++ if (mode.num_partitions > 1) ++ { ++ mode.seed = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_SEED)); ++ mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED, ++ CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM)); ++ } ++ else ++ mode.cem = int(bitfieldExtract(payload.x, CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE, CONFIG_BITS_CEM)); ++ ++ int config_bits; ++ if (mode.num_partitions > 1) ++ { ++ bool single_cem = (mode.cem & 3) == 0; ++ if (single_cem) ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + ++ CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM; ++ } ++ else ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + ++ CONFIG_BITS_SEED + CONFIG_BITS_PRIMARY_MULTI_CEM + ++ CONFIG_BITS_EXTRA_CEM_PER_PARTITION * mode.num_partitions; ++ } ++ } ++ else ++ { ++ config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_CEM; ++ } ++ ++ // Other config bits are packed before the weights. ++ int primary_config_bits; ++ if (mode.num_partitions > 1) ++ { ++ primary_config_bits = CONFIG_BITS_BLOCK + CONFIG_BITS_PARTITION_MODE + CONFIG_BITS_SEED + ++ CONFIG_BITS_PRIMARY_MULTI_CEM + CONFIG_BITS_CEM; ++ } ++ else ++ primary_config_bits = config_bits; ++ ++ if (mode.dual_plane) ++ config_bits += CONFIG_BITS_CCS; ++ ++ // This is not allowed. ++ if (any(greaterThan(mode.weight_grid_size, ivec2(gl_WorkGroupSize.xy)))) ++ decode_error = true; ++ if (mode.dual_plane && mode.num_partitions > 3) ++ decode_error = true; ++ ++ mode.config_bits = config_bits; ++ mode.primary_config_bits = primary_config_bits; ++ return mode; ++} ++ ++int idiv3_floor(int v) ++{ ++ return (v * 0x5556) >> 16; ++} ++ ++int idiv3_ceil(int v) ++{ ++ return idiv3_floor(v + 2); ++} ++ ++int idiv5_floor(int v) ++{ ++ return (v * 0x3334) >> 16; ++} ++ ++int idiv5_ceil(int v) ++{ ++ return idiv5_floor(v + 4); ++} ++ ++uvec4 build_bitmask(int bits) ++{ ++ ivec4 num_bits = ivec4(bits, bits - 32, bits - 64, bits - 96); ++ uvec4 mask = uvec4(1) << clamp(num_bits, ivec4(0), ivec4(31)); ++ mask--; ++ mask = mix(mask, uvec4(0xffffffffu), greaterThanEqual(uvec4(bits), uvec4(32, 64, 96, 128))); ++ return mask; ++} ++ ++int decode_integer_sequence(uvec4 payload, int start_bit, int index, ivec3 quant) ++{ ++ int ret; ++ if (quant.y != 0) ++ { ++ // Trit-decoding. 
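++ // Background (added note, not from the upstream shader): ASTC integer
++ // sequence encoding packs five trits (base-3 digits) into 8 bits,
++ // interleaved with the five quant.x-bit LSB fields, so a block of five
++ // values occupies 5 * quant.x + 8 bits (matching the start_bit advance
++ // below). The scattered t0..t7 offsets pick those 8 bits back out, and
++ // the LUT expands them into the five decoded trits.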
++ int block = idiv5_floor(index); ++ int offset = index - block * 5; ++ start_bit += block * (5 * quant.x + 8); ++ ++ int t0_t1_offset = start_bit + (quant.x * 1 + 0); ++ int t2_t3_offset = start_bit + (quant.x * 2 + 2); ++ int t4_offset = start_bit + (quant.x * 3 + 4); ++ int t5_t6_offset = start_bit + (quant.x * 4 + 5); ++ int t7_offset = start_bit + (quant.x * 5 + 7); ++ ++ int t = (extract_bits(payload, t0_t1_offset, 2) << 0) | ++ (extract_bits(payload, t2_t3_offset, 2) << 2) | ++ (extract_bits(payload, t4_offset, 1) << 4) | ++ (extract_bits(payload, t5_t6_offset, 2) << 5) | ++ (extract_bits(payload, t7_offset, 1) << 7); ++ ++ t = int(texelFetch(LUTTritQuintDecode, t).x); ++ t = (t >> (3 * offset)) & 7; ++ ++ int m_offset = offset * quant.x; ++ m_offset += idiv5_ceil(offset * 8); ++ ++ if (quant.x != 0) ++ { ++ int m = extract_bits(payload, m_offset + start_bit, quant.x); ++ ret = (t << quant.x) | m; ++ } ++ else ++ ret = t; ++ } ++ else if (quant.z != 0) ++ { ++ // Quint-decoding ++ int block = idiv3_floor(index); ++ int offset = index - block * 3; ++ start_bit += block * (3 * quant.x + 7); ++ ++ int q0_q1_q2_offset = start_bit + (quant.x * 1 + 0); ++ int q3_q4_offset = start_bit + (quant.x * 2 + 3); ++ int q5_q6_offset = start_bit + (quant.x * 3 + 5); ++ ++ int q = (extract_bits(payload, q0_q1_q2_offset, 3) << 0) | ++ (extract_bits(payload, q3_q4_offset, 2) << 3) | ++ (extract_bits(payload, q5_q6_offset, 2) << 5); ++ ++ q = int(texelFetch(LUTTritQuintDecode, 256 + q).x); ++ q = (q >> (3 * offset)) & 7; ++ ++ int m_offset = offset * quant.x; ++ m_offset += idiv3_ceil(offset * 7); ++ ++ if (quant.x != 0) ++ { ++ int m = extract_bits(payload, m_offset + start_bit, quant.x); ++ ret = (q << quant.x) | m; ++ } ++ else ++ ret = q; ++ } ++ else ++ { ++ int bit = index * quant.x; ++ ret = extract_bits(payload, start_bit + bit, quant.x); ++ } ++ return ret; ++} ++ ++ivec2 normalize_coord(ivec2 pixel_coord) ++{ ++ ivec2 D = ivec2((vec2((1024 + ivec2(gl_WorkGroupSize.xy >> 1u))) + 0.5) / vec2(gl_WorkGroupSize.xy - 1u)); ++ ivec2 c = D * pixel_coord; ++ return c; ++} ++ ++int decode_weight(uvec4 payload, int weight_index, ivec4 quant) ++{ ++ int primary_weight = decode_integer_sequence(payload, 0, weight_index, quant.xyz); ++ primary_weight = int(texelFetch(LUTWeightUnquantize, primary_weight + quant.w).x); ++ return primary_weight; ++} ++ ++int decode_weight_bilinear(uvec4 payload, ivec2 coord, int weight_resolution, ++ int stride, int offset, ivec2 fractional, ivec4 quant) ++{ ++ int index = coord.y * weight_resolution + coord.x; ++ int p00 = decode_weight(payload, stride * index + offset, quant); ++ int p10, p01, p11; ++ ++ if (fractional.x != 0) ++ p10 = decode_weight(payload, stride * (index + 1) + offset, quant); ++ else ++ p10 = p00; ++ ++ if (fractional.y != 0) ++ { ++ p01 = decode_weight(payload, stride * (index + weight_resolution) + offset, quant); ++ if (fractional.x != 0) ++ p11 = decode_weight(payload, stride * (index + weight_resolution + 1) + offset, quant); ++ else ++ p11 = p01; ++ } ++ else ++ { ++ p01 = p00; ++ p11 = p10; ++ } ++ ++ int w11 = (fractional.x * fractional.y + 8) >> 4; ++ int w10 = fractional.x - w11; ++ int w01 = fractional.y - w11; ++ int w00 = 16 - fractional.x - fractional.y + w11; ++ return (p00 * w00 + p10 * w10 + p01 * w01 + p11 * w11 + 8) >> 4; ++} ++ ++ivec4 decode_weights(uvec4 payload, BlockMode mode, ivec2 normalized_pixel, out int weight_cost_bits) ++{ ++ ivec4 quant = ivec4(texelFetch(LUTWeightQuantizer, mode.weight_mode_index)); ++ int num_weights = 
mode.weight_grid_size.x * mode.weight_grid_size.y; ++ num_weights <<= int(mode.dual_plane); ++ weight_cost_bits = ++ quant.x * num_weights + ++ idiv5_ceil(num_weights * 8 * quant.y) + ++ idiv3_ceil(num_weights * 7 * quant.z); ++ ++ // Decoders must deal with error conditions and return the correct error color. ++ if (weight_cost_bits < 24 || weight_cost_bits > 96 || num_weights > 64) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ ++ int ccs; ++ if (mode.dual_plane) ++ { ++ int extra_cem_bits = 0; ++ if ((mode.cem & 3) != 0) ++ extra_cem_bits = max(mode.num_partitions * 3 - 4, 0); ++ ccs = extract_bits(payload, 126 - weight_cost_bits - extra_cem_bits, 2); ++ } ++ ++ payload = bitfieldReverse(payload); ++ payload = payload.wzyx; ++ payload &= build_bitmask(weight_cost_bits); ++ ++ // Scale the normalized coordinate to weight grid. ++ ivec2 weight_pixel_fixed_point = (normalized_pixel * (mode.weight_grid_size - 1) + 32) >> 6; ++ ivec2 weight_pixel = weight_pixel_fixed_point >> 4; ++ ivec2 weight_pixel_fractional = weight_pixel_fixed_point & 0xf; ++ ++ ivec4 ret; ++ int primary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x, ++ 1 << int(mode.dual_plane), 0, ++ weight_pixel_fractional, quant); ++ if (mode.dual_plane) ++ { ++ int secondary_weight = decode_weight_bilinear(payload, weight_pixel, mode.weight_grid_size.x, ++ 2, 1, ++ weight_pixel_fractional, quant); ++ ret = mix(ivec4(primary_weight), ivec4(secondary_weight), equal(ivec4(ccs), ivec4(0, 1, 2, 3))); ++ } ++ else ++ ret = ivec4(primary_weight); ++ ++ return ret; ++} ++ ++void decode_endpoint_ldr_luma_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ ep0 = ivec4(ivec3(v0), 0xff); ++ ep1 = ivec4(ivec3(v1), 0xff); ++} ++ ++void decode_endpoint_hdr_luma_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int y0, y1; ++ if (v1 >= v0) ++ { ++ y0 = v0 << 4; ++ y1 = v1 << 4; ++ } ++ else ++ { ++ y0 = (v1 << 4) + 8; ++ y1 = (v0 << 4) - 8; ++ } ++ ++ ep0 = ivec4(ivec3(y0), 0x780); ++ ep1 = ivec4(ivec3(y1), 0x780); ++} ++ ++void decode_endpoint_hdr_luma_direct_small_range(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int y0, y1, d; ++ ++ if ((v0 & 0x80) != 0) ++ { ++ y0 = ((v1 & 0xe0) << 4) | ((v0 & 0x7f) << 2); ++ d = (v1 & 0x1f) << 2; ++ } ++ else ++ { ++ y0 = ((v1 & 0xf0) << 4) | ((v0 & 0x7f) << 1); ++ d = (v1 & 0x0f) << 1; ++ } ++ ++ y1 = min(y0 + d, 0xfff); ++ ++ ep0 = ivec4(ivec3(y0), 0x780); ++ ep1 = ivec4(ivec3(y1), 0x780); ++} ++ ++void decode_endpoint_ldr_luma_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1) ++{ ++ int l0 = (v0 >> 2) | (v1 & 0xc0); ++ int l1 = l0 + (v1 & 0x3f); ++ l1 = min(l1, 0xff); ++ ep0 = ivec4(ivec3(l0), 0xff); ++ ep1 = ivec4(ivec3(l1), 0xff); ++} ++ ++void decode_endpoint_ldr_luma_alpha_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ ep0 = ivec4(ivec3(v0), v2); ++ ep1 = ivec4(ivec3(v1), v3); ++} ++ ++ivec4 blue_contract(int r, int g, int b, int a) ++{ ++ ivec4 ret; ++ ret.r = (r + b) >> 1; ++ ret.g = (g + b) >> 1; ++ ret.b = b; ++ ret.a = a; ++ return ret; ++} ++ ++void bit_transfer_signed(inout int a, inout int b) ++{ ++ b >>= 1; ++ b |= a & 0x80; ++ a >>= 1; ++ a &= 0x3f; ++ a = bitfieldExtract(a, 0, 6); ++} ++ ++void decode_endpoint_ldr_luma_alpha_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ int v0_v1 = clamp(v0 + v1, 0, 0xff); ++ int v2_v3 = clamp(v2 + v3, 0, 0xff); ++ v0 = clamp(v0, 0, 
0xff); ++ v2 = clamp(v2, 0, 0xff); ++ ep0 = ivec4(ivec3(v0), v2); ++ ep1 = ivec4(ivec3(v0_v1), v2_v3); ++} ++ ++void decode_endpoint_ldr_rgb_base_scale(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ ep0 = ivec4((ivec3(v0, v1, v2) * v3) >> 8, 0xff); ++ ep1 = ivec4(v0, v1, v2, 0xff); ++} ++ ++void decode_endpoint_ldr_rgb_base_scale_two_a(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ ep0 = ivec4((ivec3(v0, v1, v2) * v3) >> 8, v4); ++ ep1 = ivec4(v0, v1, v2, v5); ++} ++ ++void decode_endpoint_ldr_rgb_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ int s0 = v0 + v2 + v4; ++ int s1 = v1 + v3 + v5; ++ if (s1 >= s0) ++ { ++ ep0 = ivec4(v0, v2, v4, 0xff); ++ ep1 = ivec4(v1, v3, v5, 0xff); ++ } ++ else ++ { ++ ep0 = blue_contract(v1, v3, v5, 0xff); ++ ep1 = blue_contract(v0, v2, v4, 0xff); ++ } ++} ++ ++void decode_endpoint_hdr_rgb_scale(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3) ++{ ++ // Mind-numbing weird format, just copy from spec ... ++ int mode_value = ((v0 & 0xc0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); ++ int major_component; ++ int mode; ++ ++ if ((mode_value & 0xc) != 0xc) ++ { ++ major_component = mode_value >> 2; ++ mode = mode_value & 3; ++ } ++ else if (mode_value != 0xf) ++ { ++ major_component = mode_value & 3; ++ mode = 4; ++ } ++ else ++ { ++ major_component = 0; ++ mode = 5; ++ } ++ ++ int red = v0 & 0x3f; ++ int green = v1 & 0x1f; ++ int blue = v2 & 0x1f; ++ int scale = v3 & 0x1f; ++ ++ int x0 = (v1 >> 6) & 1; ++ int x1 = (v1 >> 5) & 1; ++ int x2 = (v2 >> 6) & 1; ++ int x3 = (v2 >> 5) & 1; ++ int x4 = (v3 >> 7) & 1; ++ int x5 = (v3 >> 6) & 1; ++ int x6 = (v3 >> 5) & 1; ++ ++ int ohm = 1 << mode; ++ if ((ohm & 0x30) != 0) green |= x0 << 6; ++ if ((ohm & 0x3a) != 0) green |= x1 << 5; ++ if ((ohm & 0x30) != 0) blue |= x2 << 6; ++ if ((ohm & 0x3a) != 0) blue |= x3 << 5; ++ if ((ohm & 0x3d) != 0) scale |= x6 << 5; ++ if ((ohm & 0x2d) != 0) scale |= x5 << 6; ++ if ((ohm & 0x04) != 0) scale |= x4 << 7; ++ if ((ohm & 0x3b) != 0) red |= x4 << 6; ++ if ((ohm & 0x04) != 0) red |= x3 << 6; ++ if ((ohm & 0x10) != 0) red |= x5 << 7; ++ if ((ohm & 0x0f) != 0) red |= x2 << 7; ++ if ((ohm & 0x05) != 0) red |= x1 << 8; ++ if ((ohm & 0x0a) != 0) red |= x0 << 8; ++ if ((ohm & 0x05) != 0) red |= x0 << 9; ++ if ((ohm & 0x02) != 0) red |= x6 << 9; ++ if ((ohm & 0x01) != 0) red |= x3 << 10; ++ if ((ohm & 0x02) != 0) red |= x5 << 10; ++ ++ int shamt = max(mode, 1); ++ red <<= shamt; ++ green <<= shamt; ++ blue <<= shamt; ++ scale <<= shamt; ++ ++ if (mode != 5) ++ { ++ green = red - green; ++ blue = red - blue; ++ } ++ ++ if (major_component == 1) ++ swap(red, green); ++ else if (major_component == 2) ++ swap(red, blue); ++ ++ ep1 = ivec4(clamp(ivec3(red, green, blue), ivec3(0), ivec3(0xfff)), 0x780); ++ ep0 = ivec4(clamp(ivec3(red, green, blue) - scale, ivec3(0), ivec3(0xfff)), 0x780); ++} ++ ++void decode_endpoint_hdr_rgb_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ int major_component = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6); ++ ++ if (major_component == 3) ++ { ++ ep0 = ivec4(v0 << 4, v2 << 4, (v4 & 0x7f) << 5, 0x780); ++ ep1 = ivec4(v1 << 4, v3 << 4, (v5 & 0x7f) << 5, 0x780); ++ return; ++ } ++ ++ int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5); ++ int va = v0 | ((v1 & 0x40) << 2); ++ int vb0 = v2 & 0x3f; ++ int vb1 = v3 & 0x3f; ++ int vc = v1 & 0x3f; ++ int vd0 = v4 & 0x7f; ++ int 
vd1 = v5 & 0x7f; ++ ++ int d_bits = 7 - (mode & 1); ++ if ((mode & 5) == 4) ++ d_bits -= 2; ++ ++ vd0 = bitfieldExtract(vd0, 0, d_bits); ++ vd1 = bitfieldExtract(vd1, 0, d_bits); ++ ++ int x0 = (v2 >> 6) & 1; ++ int x1 = (v3 >> 6) & 1; ++ int x2 = (v4 >> 6) & 1; ++ int x3 = (v5 >> 6) & 1; ++ int x4 = (v4 >> 5) & 1; ++ int x5 = (v5 >> 5) & 1; ++ ++ int ohm = 1 << mode; ++ if ((ohm & 0xa4) != 0) va |= x0 << 9; ++ if ((ohm & 0x08) != 0) va |= x2 << 9; ++ if ((ohm & 0x50) != 0) va |= x4 << 9; ++ if ((ohm & 0x50) != 0) va |= x5 << 10; ++ if ((ohm & 0xa0) != 0) va |= x1 << 10; ++ if ((ohm & 0xc0) != 0) va |= x2 << 11; ++ ++ if ((ohm & 0x04) != 0) vc |= x1 << 6; ++ if ((ohm & 0xe8) != 0) vc |= x3 << 6; ++ if ((ohm & 0x20) != 0) vc |= x2 << 7; ++ ++ if ((ohm & 0x5b) != 0) vb0 |= x0 << 6; ++ if ((ohm & 0x5b) != 0) vb1 |= x1 << 6; ++ if ((ohm & 0x12) != 0) vb0 |= x2 << 7; ++ if ((ohm & 0x12) != 0) vb1 |= x3 << 7; ++ ++ int shamt = (mode >> 1) ^ 3; ++ va <<= shamt; ++ vb0 <<= shamt; ++ vb1 <<= shamt; ++ vc <<= shamt; ++ vd0 <<= shamt; ++ vd1 <<= shamt; ++ ++ ep1 = ivec4(clamp(ivec3(va, va - vb0, va - vb1), ivec3(0), ivec3(0xfff)), 0x780); ++ ep0 = ivec4(clamp(ivec3(va - vc, va - vb0 - vc - vd0, va - vb1 - vc - vd1), ivec3(0), ivec3(0xfff)), 0x780); ++ ++ if (major_component == 1) ++ { ++ swap(ep0.r, ep0.g); ++ swap(ep1.r, ep1.g); ++ } ++ else if (major_component == 2) ++ { ++ swap(ep0.r, ep0.b); ++ swap(ep1.r, ep1.b); ++ } ++} ++ ++void decode_endpoint_ldr_rgb_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ bit_transfer_signed(v5, v4); ++ if (v1 + v3 + v5 >= 0) ++ { ++ ep0 = ivec4(v0, v2, v4, 0xff); ++ ep1 = ivec4(v0 + v1, v2 + v3, v4 + v5, 0xff); ++ } ++ else ++ { ++ ep0 = blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xff); ++ ep1 = blue_contract(v0, v2, v4, 0xff); ++ } ++ ++ ep0.rgb = clamp(ep0.rgb, ivec3(0), ivec3(0xff)); ++ ep1.rgb = clamp(ep1.rgb, ivec3(0), ivec3(0xff)); ++} ++ ++void decode_endpoint_ldr_rgba_direct(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, ++ int v4, int v5, int v6, int v7) ++{ ++ int s0 = v0 + v2 + v4; ++ int s1 = v1 + v3 + v5; ++ if (s1 >= s0) ++ { ++ ep0 = ivec4(v0, v2, v4, v6); ++ ep1 = ivec4(v1, v3, v5, v7); ++ } ++ else ++ { ++ ep0 = blue_contract(v1, v3, v5, v7); ++ ep1 = blue_contract(v0, v2, v4, v6); ++ } ++} ++ ++void decode_endpoint_ldr_rgba_base_offset(out ivec4 ep0, out ivec4 ep1, ++ int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7) ++{ ++ bit_transfer_signed(v1, v0); ++ bit_transfer_signed(v3, v2); ++ bit_transfer_signed(v5, v4); ++ bit_transfer_signed(v7, v6); ++ ++ if (v1 + v3 + v5 >= 0) ++ { ++ ep0 = ivec4(v0, v2, v4, v6); ++ ep1 = ivec4(v0 + v1, v2 + v3, v4 + v5, v6 + v7); ++ } ++ else ++ { ++ ep0 = blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7); ++ ep1 = blue_contract(v0, v2, v4, v6); ++ } ++ ++ ep0 = clamp(ep0, ivec4(0), ivec4(0xff)); ++ ep1 = clamp(ep1, ivec4(0), ivec4(0xff)); ++} ++ ++void decode_endpoint_hdr_alpha(out int ep0, out int ep1, int v6, int v7) ++{ ++ int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2); ++ v6 &= 0x7f; ++ v7 &= 0x7f; ++ ++ if (mode == 3) ++ { ++ ep0 = v6 << 5; ++ ep1 = v7 << 5; ++ } ++ else ++ { ++ v6 |= (v7 << (mode + 1)) & 0x780; ++ v7 &= 0x3f >> mode; ++ v7 ^= 0x20 >> mode; ++ v7 -= 0x20 >> mode; ++ v6 <<= 4 - mode; ++ v7 <<= 4 - mode; ++ v7 += v6; ++ v7 = clamp(v7, 0, 0xfff); ++ ep0 = v6; ++ ep1 = v7; ++ } ++} ++ ++void decode_endpoint(out ivec4 ep0, out ivec4 ep1, out int 
decode_mode, ++ uvec4 payload, int bit_offset, ivec4 quant, int ep_mode, ++ int base_endpoint_index, int num_endpoint_bits) ++{ ++ num_endpoint_bits += bit_offset; ++ payload &= build_bitmask(num_endpoint_bits); ++ ++ // Could of course use an array, but that doesn't lower nicely to indexed registers on all GPUs. ++ int v0, v1, v2, v3, v4, v5, v6, v7; ++ int num_values = 2 * ((ep_mode >> 2) + 1); ++ ++#define DECODE_EP(i) \ ++ int(texelFetch(LUTEndpointUnquantize, quant.w + decode_integer_sequence(payload, bit_offset, i + base_endpoint_index, quant.xyz)).x) ++ ++ int hi_bits = ep_mode >> 2; ++ v0 = DECODE_EP(0); ++ v1 = DECODE_EP(1); ++ ++ if (hi_bits >= 1) ++ { ++ v2 = DECODE_EP(2); ++ v3 = DECODE_EP(3); ++ } ++ ++ if (hi_bits >= 2) ++ { ++ v4 = DECODE_EP(4); ++ v5 = DECODE_EP(5); ++ } ++ ++ if (hi_bits >= 3) ++ { ++ v6 = DECODE_EP(6); ++ v7 = DECODE_EP(7); ++ } ++ ++ switch (ep_mode) ++ { ++ case 0: ++ decode_endpoint_ldr_luma_direct(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 1: ++ decode_endpoint_ldr_luma_base_offset(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 2: ++ decode_endpoint_hdr_luma_direct(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 3: ++ decode_endpoint_hdr_luma_direct_small_range(ep0, ep1, ++ v0, v1); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 4: ++ decode_endpoint_ldr_luma_alpha_direct(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 5: ++ decode_endpoint_ldr_luma_alpha_base_offset(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 6: ++ decode_endpoint_ldr_rgb_base_scale(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 7: ++ decode_endpoint_hdr_rgb_scale(ep0, ep1, ++ v0, v1, v2, v3); ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 8: ++ decode_endpoint_ldr_rgb_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 9: ++ decode_endpoint_ldr_rgb_base_offset(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 10: ++ decode_endpoint_ldr_rgb_base_scale_two_a(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 11: ++ case 14: ++ case 15: ++ decode_endpoint_hdr_rgb_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5); ++ if (ep_mode == 14) ++ { ++ ep0.a = v6; ++ ep1.a = v7; ++ decode_mode = MODE_HDR_LDR_ALPHA; ++ } ++ else if (ep_mode == 15) ++ { ++ decode_endpoint_hdr_alpha(ep0.a, ep1.a, v6, v7); ++ decode_mode = MODE_HDR; ++ } ++ else ++ decode_mode = MODE_HDR; ++ break; ++ ++ case 12: ++ decode_endpoint_ldr_rgba_direct(ep0, ep1, ++ v0, v1, v2, v3, v4, v5, v6, v7); ++ decode_mode = MODE_LDR; ++ break; ++ ++ case 13: ++ decode_endpoint_ldr_rgba_base_offset(ep0, ep1, ++ v0, v1, v2, v3, v4, v5, v6, v7); ++ decode_mode = MODE_LDR; ++ break; ++ } ++ ++ if (DECODE_8BIT && decode_mode != MODE_LDR) ++ decode_error = true; ++} ++ ++#define CHECK_DECODE_ERROR() do { \ ++ if (decode_error) \ ++ { \ ++ emit_decode_error(coord.xy); \ ++ return; \ ++ } \ ++} while(false) ++ ++void emit_decode_error(ivec2 coord) ++{ ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord, gl_WorkGroupID.z), error_color); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord, gl_WorkGroupID.z), error_color); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord, error_color); ++#endif /* VULKAN */ ++} ++ ++int compute_num_endpoint_pairs(int num_partitions, int cem) ++{ ++ int ret; ++ if (num_partitions > 1) ++ { ++ bool single_cem 
= (cem & 3) == 0; ++ if (single_cem) ++ ret = ((cem >> 4) + 1) * num_partitions; ++ else ++ ret = (cem & 3) * num_partitions + bitCount(bitfieldExtract(uint(cem), 2, num_partitions)); ++ } ++ else ++ { ++ ret = (cem >> 2) + 1; ++ } ++ return ret; ++} ++ ++void decode_cem_base_endpoint(uvec4 payload, int weight_cost_bits, inout int cem, out int base_endpoint_index, ++ int num_partitions, int partition_index) ++{ ++ if (num_partitions > 1) ++ { ++ bool single_cem = (cem & 3) == 0; ++ if (single_cem) ++ { ++ cem >>= 2; ++ base_endpoint_index = ((cem >> 2) + 1) * partition_index; ++ } ++ else ++ { ++ if (partition_index != 0) ++ base_endpoint_index = (cem & 3) * partition_index + bitCount(bitfieldExtract(uint(cem), 2, partition_index)); ++ else ++ base_endpoint_index = 0; ++ ++ int base_class = (cem & 3) - 1; ++ int extra_cem_bits = num_partitions * 3 - 4; ++ int extra_bits = extract_bits(payload, 128 - weight_cost_bits - extra_cem_bits, extra_cem_bits); ++ cem = (extra_bits << 4) | (cem >> 2); ++ ++ int class_offset_bit = (cem >> partition_index) & 1; ++ int ep_bits = (cem >> (num_partitions + 2 * partition_index)) & 3; ++ ++ cem = 4 * (base_class + class_offset_bit) + ep_bits; ++ } ++ base_endpoint_index *= 2; ++ } ++ else ++ { ++ base_endpoint_index = 0; ++ } ++} ++ ++ivec4 void_extent_color(uvec4 payload, out int decode_mode) ++{ ++ int min_s = extract_bits(payload, 12, 13); ++ int max_s = extract_bits(payload, 12 + 13, 13); ++ int min_t = extract_bits(payload, 12 + 2 * 13, 13); ++ int max_t = extract_bits(payload, 12 + 3 * 13, 13); ++ ++ int reserved = extract_bits(payload, 10, 2); ++ if (reserved != 3) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ ++ if (!all(equal(ivec4(min_s, max_s, min_t, max_t), ivec4((1 << 13) - 1)))) ++ { ++ if (any(greaterThanEqual(ivec2(min_s, min_t), ivec2(max_s, max_t)))) ++ { ++ decode_error = true; ++ return ivec4(0); ++ } ++ } ++ ++ decode_mode = (payload.x & (1u << 9)) != 0u ? 
MODE_HDR : MODE_LDR; ++ ++ int r = extract_bits(payload, 64, 16); ++ int g = extract_bits(payload, 64 + 16, 16); ++ int b = extract_bits(payload, 64 + 32, 16); ++ int a = extract_bits(payload, 64 + 48, 16); ++ ++ return ivec4(r, g, b, a); ++} ++ ++void main() ++{ ++ ivec4 coord = build_coord(); ++#ifdef VULKAN ++ if (any(greaterThanEqual(coord.xy, texel_end.xy))) ++ return; ++#else /* VULKAN */ ++ if (any(greaterThanEqual(coord.xy, imageSize(OutputImage)))) ++ return; ++#endif /* VULKAN */ ++ ++ ivec2 pixel_coord = ivec2(gl_LocalInvocationID.xy); ++ int linear_pixel = int(gl_WorkGroupSize.x) * pixel_coord.y + pixel_coord.x; ++ uvec4 payload; ++#ifdef VULKAN ++ if (is_3Dimage) ++ payload = texelFetch(PayloadInput3D, ivec3(coord.zw, gl_WorkGroupID.z), 0); ++ else ++ payload = texelFetch(PayloadInput2Darray,ivec3(coord.zw, gl_WorkGroupID.z), 0); ++#else /* VULKAN */ ++ payload = texelFetch(PayloadInput, coord.zw, 0); ++#endif /* VULKAN */ ++ ++ BlockMode block_mode = decode_block_mode(payload); ++ CHECK_DECODE_ERROR(); ++ ++ ivec4 final_color; ++ int decode_mode; ++ if (block_mode.void_extent) ++ { ++ final_color = void_extent_color(payload, decode_mode); ++ CHECK_DECODE_ERROR(); ++ } ++ else ++ { ++ int weight_cost_bits; ++ ivec4 weights = decode_weights(payload, block_mode, normalize_coord(pixel_coord), weight_cost_bits); ++ ++ int partition_index = 0; ++ if (block_mode.num_partitions > 1) ++ { ++ int lut_x = pixel_coord.x + int(gl_WorkGroupSize.x) * (block_mode.seed & 31); ++ int lut_y = pixel_coord.y + int(gl_WorkGroupSize.y) * (block_mode.seed >> 5); ++#ifdef VULKAN ++ int lut_width = int(gl_WorkGroupSize.x) * 32; ++ partition_index = int(texelFetch(LUTPartitionTable, lut_y * lut_width + lut_x).x); ++#else /* VULKAN */ ++ partition_index = int(texelFetch(LUTPartitionTable, ivec2(lut_x, lut_y), 0).x); ++#endif /* VULKAN */ ++ partition_index = (partition_index >> (2 * block_mode.num_partitions - 4)) & 3; ++ } ++ ++ int available_endpoint_bits = max(128 - block_mode.config_bits - weight_cost_bits, 0); ++ ++ // In multi-partition mode, the 6-bit CEM field is encoded as ++ // First two bits tell if all CEM field are the same, if not we specify a class offset, and N bits ++ // after that will offset the class by 1. ++ int num_endpoint_pairs = compute_num_endpoint_pairs(block_mode.num_partitions, block_mode.cem); ++ ++ // Error color must be emitted if we need more than 18 integer sequence encoded values of color. ++ if (num_endpoint_pairs > 9) ++ { ++ decode_error = true; ++ emit_decode_error(coord.xy); ++ return; ++ } ++ ++ ivec4 endpoint_quant = ivec4(texelFetch(LUTRemainingBitsToEndpointQuantizer, ++ 128 * (num_endpoint_pairs - 1) + available_endpoint_bits)); ++ ++ // Only read the bits we need for endpoints. ++ int num_endpoint_values = num_endpoint_pairs * 2; ++ available_endpoint_bits = ++ endpoint_quant.x * num_endpoint_values + ++ idiv5_ceil(endpoint_quant.y * 8 * num_endpoint_values) + ++ idiv3_ceil(endpoint_quant.z * 7 * num_endpoint_values); ++ ++ // No space left for color endpoints. ++ if (all(equal(endpoint_quant.xyz, ivec3(0)))) ++ { ++ decode_error = true; ++ emit_decode_error(coord.xy); ++ return; ++ } ++ ++ int endpoint_bit_offset = block_mode.primary_config_bits; ++ ivec4 ep0, ep1; ++ ++ // Decode CEM for multi-partition schemes. 
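++ // With more than one partition, only the 2-bit class selector and the
++ // first four CEM bits sit next to the partition seed; the remaining
++ // per-partition bits are stored just below the weight stream, so
++ // decode_cem_base_endpoint() reassembles this partition's 4-bit CEM
++ // and the offset of its first endpoint value in the sequence.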
++ int cem = block_mode.cem; ++ int base_endpoint_index; ++ decode_cem_base_endpoint(payload, weight_cost_bits, cem, base_endpoint_index, ++ block_mode.num_partitions, partition_index); ++ ++ decode_endpoint(ep0, ep1, decode_mode, payload, endpoint_bit_offset, endpoint_quant, ++ cem, base_endpoint_index, available_endpoint_bits); ++ CHECK_DECODE_ERROR(); ++ ++ final_color = interpolate_endpoint(ep0, ep1, weights, decode_mode); ++ } ++ ++ if (DECODE_8BIT) ++ { ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord.xy, gl_WorkGroupID.z), uvec4(final_color >> 8)); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), uvec4(final_color >> 8)); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord.xy, uvec4(final_color >> 8)); ++#endif /* VULKAN */ ++ } ++ else ++ { ++ uvec4 encoded; ++ if (block_mode.void_extent && decode_mode == MODE_HDR) ++ encoded = uvec4(final_color); ++ else ++ encoded = decode_fp16(final_color, decode_mode); ++#ifdef VULKAN ++ if (is_3Dimage) ++ imageStore(OutputImage3D, ivec3(coord.xy, gl_WorkGroupID.z), encoded); ++ else ++ imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), encoded); ++#else /* VULKAN */ ++ imageStore(OutputImage, coord.xy, encoded); ++#endif /* VULKAN */ ++ } ++} +diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl +new file mode 100644 +index 00000000000..251635d7b69 +--- /dev/null ++++ b/src/compiler/glsl/bc1.glsl +@@ -0,0 +1,544 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#version 310 es ++ ++#if defined(GL_ES) && GL_ES == 1 ++ // Desktop GLSL allows the const keyword for either compile-time or ++ // run-time constants. GLSL ES only allows the keyword for compile-time ++ // constants. Since we use const on run-time constants, define it to ++ // nothing. 
++ #define const ++#endif ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++#define FLT_MAX 340282346638528859811704183484516925440.0f ++ ++layout( location = 0 ) uniform uint p_numRefinements; ++ ++uniform sampler2D srcTex; ++ ++layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++layout( std430, binding = 1 ) readonly restrict buffer globalBuffer ++{ ++ float2 c_oMatch5[256]; ++ float2 c_oMatch6[256]; ++}; ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ ++float3 rgb565to888( float rgb565 ) ++{ ++ float3 retVal; ++ retVal.x = floor( rgb565 / 2048.0f ); ++ retVal.y = floor( mod( rgb565, 2048.0f ) / 32.0f ); ++ retVal.z = floor( mod( rgb565, 32.0f ) ); ++ ++ // This is the correct 565 to 888 conversion: ++ // rgb = floor( rgb * ( 255.0f / float3( 31.0f, 63.0f, 31.0f ) ) + 0.5f ) ++ // ++ // However stb_dxt follows a different one: ++ // rb = floor( rb * ( 256 / 32 + 8 / 32 ) ); ++ // g = floor( g * ( 256 / 64 + 4 / 64 ) ); ++ // ++ // I'm not sure exactly why but it's possible this is how the S3TC specifies it should be decoded ++ // It's quite possible this is the reason: ++ // http://www.ludicon.com/castano/blog/2009/03/gpu-dxt-decompression/ ++ // ++ // Or maybe it's just because it's cheap to do with integer shifts. ++ // Anyway, we follow stb_dxt's conversion just in case ++ // (gives almost the same result, with 1 or -1 of difference for a very few values) ++ // ++ // Perhaps when we make 888 -> 565 -> 888 it doesn't matter ++ // because they end up mapping to the original number ++ ++ return floor( retVal * float3( 8.25f, 4.0625f, 8.25f ) ); ++} ++ ++float rgb888to565( float3 rgbValue ) ++{ ++ rgbValue.rb = floor( rgbValue.rb * 31.0f / 255.0f + 0.5f ); ++ rgbValue.g = floor( rgbValue.g * 63.0f / 255.0f + 0.5f ); ++ ++ return rgbValue.r * 2048.0f + rgbValue.g * 32.0f + rgbValue.b; ++} ++ ++// linear interpolation at 1/3 point between a and b, using desired rounding type ++float3 lerp13( float3 a, float3 b ) ++{ ++#ifdef STB_DXT_USE_ROUNDING_BIAS ++ // with rounding bias ++ return a + floor( ( b - a ) * ( 1.0f / 3.0f ) + 0.5f ); ++#else ++ // without rounding bias ++ return floor( ( 2.0f * a + b ) / 3.0f ); ++#endif ++} ++ ++/// Unpacks a block of 4 colours from two 16-bit endpoints ++void EvalColors( out float3 colours[4], float c0, float c1 ) ++{ ++ colours[0] = rgb565to888( c0 ); ++ colours[1] = rgb565to888( c1 ); ++ colours[2] = lerp13( colours[0], colours[1] ); ++ colours[3] = lerp13( colours[1], colours[0] ); ++} ++ ++/** The color optimization function. 
(Clever code, part 1) ++@param outMinEndp16 [out] ++ Minimum endpoint, in RGB565 ++@param outMaxEndp16 [out] ++ Maximum endpoint, in RGB565 ++*/ ++void OptimizeColorsBlock( const uint srcPixelsBlock[16], out float outMinEndp16, out float outMaxEndp16 ) ++{ ++ // determine color distribution ++ float3 avgColour; ++ float3 minColour; ++ float3 maxColour; ++ ++ avgColour = minColour = maxColour = unpackUnorm4x8( srcPixelsBlock[0] ).xyz; ++ for( int i = 1; i < 16; ++i ) ++ { ++ const float3 currColourUnorm = unpackUnorm4x8( srcPixelsBlock[i] ).xyz; ++ avgColour += currColourUnorm; ++ minColour = min( minColour, currColourUnorm ); ++ maxColour = max( maxColour, currColourUnorm ); ++ } ++ ++ avgColour = round( avgColour * 255.0f / 16.0f ); ++ maxColour *= 255.0f; ++ minColour *= 255.0f; ++ ++ // determine covariance matrix ++ float cov[6]; ++ for( int i = 0; i < 6; ++i ) ++ cov[i] = 0.0f; ++ ++ for( int i = 0; i < 16; ++i ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ float3 rgbDiff = currColour - avgColour; ++ ++ cov[0] += rgbDiff.r * rgbDiff.r; ++ cov[1] += rgbDiff.r * rgbDiff.g; ++ cov[2] += rgbDiff.r * rgbDiff.b; ++ cov[3] += rgbDiff.g * rgbDiff.g; ++ cov[4] += rgbDiff.g * rgbDiff.b; ++ cov[5] += rgbDiff.b * rgbDiff.b; ++ } ++ ++ // convert covariance matrix to float, find principal axis via power iter ++ for( int i = 0; i < 6; ++i ) ++ cov[i] /= 255.0f; ++ ++ float3 vF = maxColour - minColour; ++ ++ const int nIterPower = 4; ++ for( int iter = 0; iter < nIterPower; ++iter ) ++ { ++ const float r = vF.r * cov[0] + vF.g * cov[1] + vF.b * cov[2]; ++ const float g = vF.r * cov[1] + vF.g * cov[3] + vF.b * cov[4]; ++ const float b = vF.r * cov[2] + vF.g * cov[4] + vF.b * cov[5]; ++ ++ vF.r = r; ++ vF.g = g; ++ vF.b = b; ++ } ++ ++ float magn = max3( abs( vF.r ), abs( vF.g ), abs( vF.b ) ); ++ float3 v; ++ ++ if( magn < 4.0f ) ++ { // too small, default to luminance ++ v.r = 299.0f; // JPEG YCbCr luma coefs, scaled by 1000. ++ v.g = 587.0f; ++ v.b = 114.0f; ++ } ++ else ++ { ++ v = trunc( vF * ( 512.0f / magn ) ); ++ } ++ ++ // Pick colors at extreme points ++ float3 minEndpoint, maxEndpoint; ++ float minDot = FLT_MAX; ++ float maxDot = -FLT_MAX; ++ for( int i = 0; i < 16; ++i ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ const float dotValue = dot( currColour, v ); ++ ++ if( dotValue < minDot ) ++ { ++ minDot = dotValue; ++ minEndpoint = currColour; ++ } ++ ++ if( dotValue > maxDot ) ++ { ++ maxDot = dotValue; ++ maxEndpoint = currColour; ++ } ++ } ++ ++ outMinEndp16 = rgb888to565( minEndpoint ); ++ outMaxEndp16 = rgb888to565( maxEndpoint ); ++} ++ ++// The color matching function ++uint MatchColorsBlock( const uint srcPixelsBlock[16], float3 colour[4] ) ++{ ++ uint mask = 0u; ++ float3 dir = colour[0] - colour[1]; ++ float stops[4]; ++ ++ for( int i = 0; i < 4; ++i ) ++ stops[i] = dot( colour[i], dir ); ++ ++ // think of the colors as arranged on a line; project point onto that line, then choose ++ // next color out of available ones. we compute the crossover points for "best color in top ++ // half"/"best in bottom half" and then the same inside that subinterval. ++ // ++ // relying on this 1d approximation isn't always optimal in terms of euclidean distance, ++ // but it's very close and a lot faster. 
++ // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
++
++ float c0Point = trunc( ( stops[1] + stops[3] ) * 0.5f );
++ float halfPoint = trunc( ( stops[3] + stops[2] ) * 0.5f );
++ float c3Point = trunc( ( stops[2] + stops[0] ) * 0.5f );
++
++#ifndef BC1_DITHER
++ // the version without dithering is straightforward
++ for( uint i = 16u; i-- > 0u; )
++ {
++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f;
++
++ const float dotValue = dot( currColour, dir );
++ mask <<= 2u;
++
++ if( dotValue < halfPoint )
++ mask |= ( ( dotValue < c0Point ) ? 1u : 3u );
++ else
++ mask |= ( ( dotValue < c3Point ) ? 2u : 0u );
++ }
++#else
++ // with floyd-steinberg dithering
++ float4 ep1 = float4( 0, 0, 0, 0 );
++ float4 ep2 = float4( 0, 0, 0, 0 );
++
++ c0Point *= 16.0f;
++ halfPoint *= 16.0f;
++ c3Point *= 16.0f;
++
++ for( uint y = 0u; y < 4u; ++y )
++ {
++ float ditherDot;
++ uint lmask, step;
++
++ float3 currColour;
++ float dotValue;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 0u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 3.0f * ep2[1] + 5.0f * ep2[0] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[0] = dotValue - stops[step];
++ lmask = step;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 1u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[0] + 3.0f * ep2[2] + 5.0f * ep2[1] + ep2[0] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[1] = dotValue - stops[step];
++ lmask |= step << 2u;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 2u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[2] = dotValue - stops[step];
++ lmask |= step << 4u;
++
++ currColour = unpackUnorm4x8( srcPixelsBlock[y * 4u + 3u] ).xyz * 255.0f;
++ dotValue = dot( currColour, dir );
++
++ ditherDot = ( dotValue * 16.0f ) + ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] );
++ if( ditherDot < halfPoint )
++ step = ( ditherDot < c0Point ) ? 1u : 3u;
++ else
++ step = ( ditherDot < c3Point ) ? 2u : 0u;
++ ep1[3] = dotValue - stops[step];
++ lmask |= step << 6u;
++
++ mask |= lmask << ( y * 8u );
++ {
++ float4 tmp = ep1;
++ ep1 = ep2;
++ ep2 = tmp;
++ } // swap
++ }
++#endif
++
++ return mask;
++}
++
++// The refinement function. (Clever code, part 2)
++// Tries to optimize colors to suit block contents better.
++// (By solving a least squares system via normal equations+Cramer's rule)
++bool RefineBlock( const uint srcPixelsBlock[16], uint mask, inout float inOutMinEndp16,
++ inout float inOutMaxEndp16 )
++{
++ float newMin16, newMax16;
++ const float oldMin = inOutMinEndp16;
++ const float oldMax = inOutMaxEndp16;
++
++ // A solid-index mask is one 2-bit pattern repeated 16 times, so XORing
++ // it with itself shifted by one index can only leave bits set in the
++ // bottom texel.
++ if( ( mask ^ ( mask << 2u ) ) < 4u ) // all pixels have the same index?
++ { ++ // yes, linear system would be singular; solve using optimal ++ // single-color match on average color ++ float3 rgbVal = float3( 8.0f / 255.0f, 8.0f / 255.0f, 8.0f / 255.0f ); ++ for( int i = 0; i < 16; ++i ) ++ rgbVal += unpackUnorm4x8( srcPixelsBlock[i] ).xyz; ++ ++ rgbVal = floor( rgbVal * ( 255.0f / 16.0f ) ); ++ ++ newMax16 = c_oMatch5[uint( rgbVal.r )][0] * 2048.0f + // ++ c_oMatch6[uint( rgbVal.g )][0] * 32.0f + // ++ c_oMatch5[uint( rgbVal.b )][0]; ++ newMin16 = c_oMatch5[uint( rgbVal.r )][1] * 2048.0f + // ++ c_oMatch6[uint( rgbVal.g )][1] * 32.0f + // ++ c_oMatch5[uint( rgbVal.b )][1]; ++ } ++ else ++ { ++ const float w1Tab[4] = float[4]( 3.0f, 0.0f, 2.0f, 1.0f ); ++ const float prods[4] = float[4]( 589824.0f, 2304.0f, 262402.0f, 66562.0f ); ++ // ^some magic to save a lot of multiplies in the accumulating loop... ++ // (precomputed products of weights for least squares system, accumulated inside one 32-bit ++ // register) ++ ++ float akku = 0.0f; ++ uint cm = mask; ++ float3 at1 = float3( 0, 0, 0 ); ++ float3 at2 = float3( 0, 0, 0 ); ++ for( int i = 0; i < 16; ++i, cm >>= 2u ) ++ { ++ const float3 currColour = unpackUnorm4x8( srcPixelsBlock[i] ).xyz * 255.0f; ++ ++ const uint step = cm & 3u; ++ const float w1 = w1Tab[step]; ++ akku += prods[step]; ++ at1 += currColour * w1; ++ at2 += currColour; ++ } ++ ++ at2 = 3.0f * at2 - at1; ++ ++ // extract solutions and decide solvability ++ const float xx = floor( akku / 65535.0f ); ++ const float yy = floor( mod( akku, 65535.0f ) / 256.0f ); ++ const float xy = mod( akku, 256.0f ); ++ ++ float2 f_rb_g; ++ f_rb_g.x = 3.0f * 31.0f / 255.0f / ( xx * yy - xy * xy ); ++ f_rb_g.y = f_rb_g.x * 63.0f / 31.0f; ++ ++ // solve. ++ const float3 newMaxVal = clamp( floor( ( at1 * yy - at2 * xy ) * f_rb_g.xyx + 0.5f ), ++ float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); ++ newMax16 = newMaxVal.x * 2048.0f + newMaxVal.y * 32.0f + newMaxVal.z; ++ ++ const float3 newMinVal = clamp( floor( ( at2 * xx - at1 * xy ) * f_rb_g.xyx + 0.5f ), ++ float3( 0.0f, 0.0f, 0.0f ), float3( 31, 63, 31 ) ); ++ newMin16 = newMinVal.x * 2048.0f + newMinVal.y * 32.0f + newMinVal.z; ++ } ++ ++ inOutMinEndp16 = newMin16; ++ inOutMaxEndp16 = newMax16; ++ ++ return oldMin != newMin16 || oldMax != newMax16; ++} ++ ++#ifdef BC1_DITHER ++/// Quantizes 'srcValue' which is originally in 888 (full range), ++/// converting it to 565 and then back to 888 (quantized) ++float3 quant( float3 srcValue ) ++{ ++ srcValue = clamp( srcValue, 0.0f, 255.0f ); ++ // Convert 888 -> 565 ++ srcValue = floor( srcValue * float3( 31.0f / 255.0f, 63.0f / 255.0f, 31.0f / 255.0f ) + 0.5f ); ++ // Convert 565 -> 888 back ++ srcValue = floor( srcValue * float3( 8.25f, 4.0625f, 8.25f ) ); ++ ++ return srcValue; ++} ++ ++void DitherBlock( const uint srcPixBlck[16], out uint dthPixBlck[16] ) ++{ ++ float3 ep1[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); ++ float3 ep2[4] = float3[4]( float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ), float3( 0, 0, 0 ) ); ++ ++ for( uint y = 0u; y < 16u; y += 4u ) ++ { ++ float3 srcPixel, dithPixel; ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 0u] ).xyz * 255.0f; ++ dithPixel = quant( srcPixel + trunc( ( 3.0f * ep2[1] + 5.0f * ep2[0] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[0] = srcPixel - dithPixel; ++ dthPixBlck[y + 0u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 1u] ).xyz * 255.0f; ++ dithPixel = quant( ++ srcPixel + trunc( ( 7.0f * ep1[0] + 3.0f * 
ep2[2] + 5.0f * ep2[1] + ep2[0] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[1] = srcPixel - dithPixel; ++ dthPixBlck[y + 1u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 2u] ).xyz * 255.0f; ++ dithPixel = quant( ++ srcPixel + trunc( ( 7.0f * ep1[1] + 3.0f * ep2[3] + 5.0f * ep2[2] + ep2[1] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[2] = srcPixel - dithPixel; ++ dthPixBlck[y + 2u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ srcPixel = unpackUnorm4x8( srcPixBlck[y + 3u] ).xyz * 255.0f; ++ dithPixel = quant( srcPixel + trunc( ( 7.0f * ep1[2] + 5.0f * ep2[3] + ep2[2] ) * ( 1.0f / 16.0f ) ) ); ++ ep1[3] = srcPixel - dithPixel; ++ dthPixBlck[y + 3u] = packUnorm4x8( float4( dithPixel * ( 1.0f / 255.0f ), 1.0f ) ); ++ ++ // swap( ep1, ep2 ) ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ float3 tmp = ep1[i]; ++ ep1[i] = ep2[i]; ++ ep2[i] = tmp; ++ } ++ } ++} ++#endif ++ ++void main() ++{ ++ uint srcPixelsBlock[16]; ++ ++ bool bAllColoursEqual = true; ++ ++ // Load the whole 4x4 block ++ const uint2 pixelsToLoadBase = gl_GlobalInvocationID.xy << 2u; ++ for( uint i = 0u; i < 16u; ++i ) ++ { ++ const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i & 0x03u, i >> 2u ); ++ const float3 srcPixels0 = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyz; ++ srcPixelsBlock[i] = packUnorm4x8( float4( srcPixels0, 1.0f ) ); ++ bAllColoursEqual = bAllColoursEqual && srcPixelsBlock[0] == srcPixelsBlock[i]; ++ } ++ ++ float maxEndp16, minEndp16; ++ uint mask = 0u; ++ ++ if( bAllColoursEqual ) ++ { ++ const uint3 rgbVal = uint3( unpackUnorm4x8( srcPixelsBlock[0] ).xyz * 255.0f ); ++ mask = 0xAAAAAAAAu; ++ maxEndp16 = ++ c_oMatch5[rgbVal.r][0] * 2048.0f + c_oMatch6[rgbVal.g][0] * 32.0f + c_oMatch5[rgbVal.b][0]; ++ minEndp16 = ++ c_oMatch5[rgbVal.r][1] * 2048.0f + c_oMatch6[rgbVal.g][1] * 32.0f + c_oMatch5[rgbVal.b][1]; ++ } ++ else ++ { ++#ifdef BC1_DITHER ++ uint ditherPixelsBlock[16]; ++ // first step: compute dithered version for PCA if desired ++ DitherBlock( srcPixelsBlock, ditherPixelsBlock ); ++#else ++# define ditherPixelsBlock srcPixelsBlock ++#endif ++ ++ // second step: pca+map along principal axis ++ OptimizeColorsBlock( ditherPixelsBlock, minEndp16, maxEndp16 ); ++ if( minEndp16 != maxEndp16 ) ++ { ++ float3 colours[4]; ++ EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted ++ mask = MatchColorsBlock( srcPixelsBlock, colours ); ++ } ++ ++ // third step: refine (multiple times if requested) ++ bool bStopRefinement = false; ++ for( uint i = 0u; i < p_numRefinements && !bStopRefinement; ++i ) ++ { ++ const uint lastMask = mask; ++ ++ if( RefineBlock( ditherPixelsBlock, mask, minEndp16, maxEndp16 ) ) ++ { ++ if( minEndp16 != maxEndp16 ) ++ { ++ float3 colours[4]; ++ EvalColors( colours, maxEndp16, minEndp16 ); // Note min/max are inverted ++ mask = MatchColorsBlock( srcPixelsBlock, colours ); ++ } ++ else ++ { ++ mask = 0u; ++ bStopRefinement = true; ++ } ++ } ++ ++ bStopRefinement = mask == lastMask || bStopRefinement; ++ } ++ } ++ ++ // write the color block ++ if( maxEndp16 < minEndp16 ) ++ { ++ const float tmpValue = minEndp16; ++ minEndp16 = maxEndp16; ++ maxEndp16 = tmpValue; ++ mask ^= 0x55555555u; ++ } ++ ++ uint4 outputBytes; ++ outputBytes.x = uint( maxEndp16 ); ++ outputBytes.y = uint( minEndp16 ); ++ outputBytes.z = mask & 0xFFFFu; ++ outputBytes.w = mask >> 16u; ++ ++ uint2 dstUV = gl_GlobalInvocationID.xy; ++ imageStore( dstTexture, int2( dstUV ), outputBytes ); ++} +diff --git 
a/src/compiler/glsl/bc4.glsl b/src/compiler/glsl/bc4.glsl +new file mode 100644 +index 00000000000..2486fa4ae0c +--- /dev/null ++++ b/src/compiler/glsl/bc4.glsl +@@ -0,0 +1,187 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++#version 310 es ++ ++#if defined(GL_ES) && GL_ES == 1 ++ // Desktop GLSL allows the const keyword for either compile-time or ++ // run-time constants. GLSL ES only allows the keyword for compile-time ++ // constants. Since we use const on run-time constants, define it to ++ // nothing. ++ #define const ++#endif ++ ++#define __sharedOnlyBarrier memoryBarrierShared();barrier(); ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++shared float2 g_minMaxValues[4u * 4u * 4u]; ++shared uint2 g_mask[4u * 4u]; ++ ++layout( location = 0 ) uniform uint2 params; ++ ++#define p_channelIdx params.x ++#define p_useSNorm params.y ++ ++uniform sampler2D srcTex; ++ ++layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++layout( local_size_x = 4, // ++ local_size_y = 4, // ++ local_size_z = 4 ) in; ++ ++/// Each block is 16 pixels ++/// Each thread works on 4 pixels ++/// Therefore each block needs 4 threads, generating 8 masks ++/// At the end these 8 masks get merged into 2 and results written to output ++/// ++/// **Q: Why 4 pixels per thread? Why not 1 pixel per thread? Why not 2? Why not 16?** ++/// ++/// A: It's a sweetspot. ++/// - Very short threads cannot fill expensive GPUs with enough work (dispatch bound) ++/// - Lots of threads means lots of synchronization (e.g. evaluating min/max, merging masks) ++/// overhead, and also more LDS usage which reduces occupancy. ++/// - Long threads (e.g. 1 thread per block) misses parallelism opportunities ++void main() ++{ ++ float minVal, maxVal; ++ float4 srcPixel; ++ ++ const uint blockThreadId = gl_LocalInvocationID.x; ++ ++ const uint2 pixelsToLoadBase = gl_GlobalInvocationID.yz << 2u; ++ ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ const uint2 pixelsToLoad = pixelsToLoadBase + uint2( i, blockThreadId ); ++ ++ const float4 value = OGRE_Load2D( srcTex, int2( pixelsToLoad ), 0 ).xyzw; ++ srcPixel[i] = p_channelIdx == 0u ? value.x : ( p_channelIdx == 1u ? 
value.y : value.w ); ++ srcPixel[i] *= 255.0f; ++ } ++ ++ minVal = min3( srcPixel.x, srcPixel.y, srcPixel.z ); ++ maxVal = max3( srcPixel.x, srcPixel.y, srcPixel.z ); ++ minVal = min( minVal, srcPixel.w ); ++ maxVal = max( maxVal, srcPixel.w ); ++ ++ const uint minMaxIdxBase = ( gl_LocalInvocationID.z << 4u ) + ( gl_LocalInvocationID.y << 2u ); ++ const uint maskIdxBase = ( gl_LocalInvocationID.z << 2u ) + gl_LocalInvocationID.y; ++ ++ g_minMaxValues[minMaxIdxBase + blockThreadId] = float2( minVal, maxVal ); ++ g_mask[maskIdxBase] = uint2( 0u, 0u ); ++ ++ __sharedOnlyBarrier; ++ ++ // Have all 4 threads in the block grab the min/max value by comparing what all 4 threads uploaded ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ minVal = min( g_minMaxValues[minMaxIdxBase + i].x, minVal ); ++ maxVal = max( g_minMaxValues[minMaxIdxBase + i].y, maxVal ); ++ } ++ ++ // determine bias and emit color indices ++ // given the choice of maxVal/minVal, these indices are optimal: ++ // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ ++ float dist = maxVal - minVal; ++ float dist4 = dist * 4.0f; ++ float dist2 = dist * 2.0f; ++ float bias = ( dist < 8.0f ) ? ( dist - 1.0f ) : ( trunc( dist * 0.5f ) + 2.0f ); ++ bias -= minVal * 7.0f; ++ ++ uint mask0 = 0u, mask1 = 0u; ++ ++ for( uint i = 0u; i < 4u; ++i ) ++ { ++ float a = srcPixel[i] * 7.0f + bias; ++ ++ int ind = 0; ++ ++ // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max). ++ if( a >= dist4 ) ++ { ++ ind = 4; ++ a -= dist4; ++ } ++ ++ if( a >= dist2 ) ++ { ++ ind += 2; ++ a -= dist2; ++ } ++ ++ if( a >= dist ) ++ ind += 1; ++ ++ // turn linear scale into DXT index (0/1 are extremal pts) ++ ind = -ind & 7; ++ ind ^= ( 2 > ind ) ? 1 : 0; ++ ++ // write index ++ const uint bits = 16u + ( ( blockThreadId << 2u ) + i ) * 3u; ++ if( bits < 32u ) ++ { ++ mask0 |= uint( ind ) << bits; ++ if( bits + 3u > 32u ) ++ { ++ mask1 |= uint( ind ) >> ( 32u - bits ); ++ } ++ } ++ else ++ { ++ mask1 |= uint( ind ) << ( bits - 32u ); ++ } ++ } ++ ++ if( mask0 != 0u ) ++ atomicOr( g_mask[maskIdxBase].x, mask0 ); ++ if( mask1 != 0u ) ++ atomicOr( g_mask[maskIdxBase].y, mask1 ); ++ ++ __sharedOnlyBarrier; ++ ++ if( blockThreadId == 0u ) ++ { ++ // Save data ++ uint4 outputBytes; ++ ++ if( p_useSNorm != 0u ) ++ { ++ outputBytes.x = ++ packSnorm4x8( float4( maxVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, ++ minVal * ( 1.0f / 255.0f ) * 2.0f - 1.0f, 0.0f, 0.0f ) ); ++ } ++ else ++ { ++ outputBytes.x = packUnorm4x8( ++ float4( maxVal * ( 1.0f / 255.0f ), minVal * ( 1.0f / 255.0f ), 0.0f, 0.0f ) ); ++ } ++ outputBytes.y = g_mask[maskIdxBase].x >> 16u; ++ outputBytes.z = g_mask[maskIdxBase].y & 0xFFFFu; ++ outputBytes.w = g_mask[maskIdxBase].y >> 16u; ++ ++ uint2 dstUV = gl_GlobalInvocationID.yz; ++ imageStore( dstTexture, int2( dstUV ), outputBytes ); ++ } ++} +diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl +new file mode 100644 +index 00000000000..f90accb82e1 +--- /dev/null ++++ b/src/compiler/glsl/etc2_rgba_stitch.glsl +@@ -0,0 +1,46 @@ ++/* ++ * Copyright 2020-2022 Matias N. 
Goldberg ++ * Copyright 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. ++ */ ++ ++// RGB and Alpha components of ETC2 RGBA are computed separately. ++// This compute shader merely stitches them together to form the final result ++// It's also used by RG11 driver to stitch two R11 into one RG11 ++ ++#version 310 es ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ ++layout( binding = 0 ) uniform highp usampler2D srcRGB; ++layout( binding = 1 ) uniform highp usampler2D srcAlpha; ++layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; ++ ++void main() ++{ ++ uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; ++ uint2 etcAlpha = OGRE_Load2D( srcAlpha, int2( gl_GlobalInvocationID.xy ), 0 ).xy; ++ ++ imageStore( dstTexture, int2( gl_GlobalInvocationID.xy ), uint4( etcAlpha.xy, etcRgb.xy ) ); ++} +diff --git a/src/compiler/glsl/meson.build b/src/compiler/glsl/meson.build +index abae5adeabe..a130a963b2b 100644 +--- a/src/compiler/glsl/meson.build ++++ b/src/compiler/glsl/meson.build +@@ -70,6 +70,41 @@ float64_glsl_h = custom_target( + command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'float64_source'], + ) + ++cross_platform_settings_piece_all_h = custom_target( ++ 'cross_platform_settings_piece_all.h', ++ input : [files_xxd, 'CrossPlatformSettings_piece_all.glsl'], ++ output : 'cross_platform_settings_piece_all.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'cross_platform_settings_piece_all_header'], ++) ++ ++bc1_glsl_h = custom_target( ++ 'bc1_glsl.h', ++ input : [files_xxd, 'bc1.glsl'], ++ output : 'bc1_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc1_source'], ++) ++ ++bc4_glsl_h = custom_target( ++ 'bc4_glsl.h', ++ input : [files_xxd, 'bc4.glsl'], ++ output : 'bc4_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'bc4_source'], ++) ++ ++etc2_rgba_stitch_glsl_h = custom_target( ++ 'etc2_rgba_stitch_glsl.h', ++ input : [files_xxd, 'etc2_rgba_stitch.glsl'], ++ output : 'etc2_rgba_stitch_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'etc2_rgba_stitch_source'], ++) ++ ++astc_glsl_h = custom_target( ++ 'astc_glsl.h', ++ input : [files_xxd, 'astc_decoder.glsl'], ++ output : 'astc_glsl.h', ++ command : [prog_python, '@INPUT@', '@OUTPUT@', '-n', 'astc_source'], ++) ++ + files_libglsl = files( + 'ast.h', + 
'ast_array_index.cpp', +@@ -233,7 +268,8 @@ libglsl = static_library( + 'glsl', + [files_libglsl, glsl_parser, glsl_lexer_cpp, ir_expression_operation_h, + ir_expression_operation_strings_h, ir_expression_operation_constant_h, +- float64_glsl_h], ++ float64_glsl_h, cross_platform_settings_piece_all_h, bc1_glsl_h, bc4_glsl_h, ++ etc2_rgba_stitch_glsl_h, astc_glsl_h], + c_args : [c_msvc_compat_args, no_override_init_args], + cpp_args : [cpp_msvc_compat_args], + gnu_symbol_visibility : 'hidden', +diff --git a/src/compiler/meson.build b/src/compiler/meson.build +index ce6c6c06c44..e18af3bf335 100644 +--- a/src/compiler/meson.build ++++ b/src/compiler/meson.build +@@ -22,6 +22,8 @@ inc_compiler = include_directories('.') + inc_glsl = include_directories('glsl') + inc_spirv = include_directories('spirv') + ++astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') ++ + files_libcompiler = files( + 'builtin_type_macros.h', + 'glsl_types.cpp', +diff --git a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +index aacf408e608..e8c70fd4ae7 100644 +--- a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h ++++ b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h +@@ -39,7 +39,7 @@ DRI_CONF_SECTION_DEBUG + DRI_CONF_FORCE_GL_NAMES_REUSE(false) + DRI_CONF_FORCE_GL_MAP_BUFFER_SYNCHRONIZED(false) + DRI_CONF_TRANSCODE_ETC(false) +- DRI_CONF_TRANSCODE_ASTC(false) ++ DRI_CONF_TRANSCODE_ASTC(true) + DRI_CONF_FORCE_GL_VENDOR() + DRI_CONF_FORCE_GL_RENDERER() + DRI_CONF_OVERRIDE_VRAM_SIZE() +diff --git a/src/util/driconf.h b/src/util/driconf.h +index 7ca309f1ab3..37e617ca413 100644 +--- a/src/util/driconf.h ++++ b/src/util/driconf.h +@@ -571,6 +571,11 @@ + DRI_CONF_OPT_B(radv_require_etc2, def, \ + "Implement emulated ETC2 on HW that does not support it") + ++ ++#define DRI_CONF_RADV_REQUIRE_ASTC(def) \ ++ DRI_CONF_OPT_B(radv_require_astc, def, \ ++ "Implement emulated ASTC on HW that does not support it") ++ + #define DRI_CONF_RADV_DISABLE_HTILE_LAYERS(def) \ + DRI_CONF_OPT_B(radv_disable_htile_layers, def, \ + "Disable HTILE for layered depth/stencil formats") +diff --git a/src/util/meson.build b/src/util/meson.build +index 2a1028f0d3a..03db5cf524b 100644 +--- a/src/util/meson.build ++++ b/src/util/meson.build +@@ -115,6 +115,10 @@ files_mesa_util = files( + 'strndup.h', + 'strtod.c', + 'strtod.h', ++ 'texcompress_astc_luts.cpp', ++ 'texcompress_astc_luts.h', ++ 'texcompress_astc_luts_wrap.cpp', ++ 'texcompress_astc_luts_wrap.h', + 'texcompress_rgtc_tmp.h', + 'timespec.h', + 'u_atomic.c', +diff --git a/src/util/texcompress_astc_luts.cpp b/src/util/texcompress_astc_luts.cpp +new file mode 100644 +index 00000000000..6ffaf23ea88 +--- /dev/null ++++ b/src/util/texcompress_astc_luts.cpp +@@ -0,0 +1,532 @@ ++/* Copyright (c) 2017-2022 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "texcompress_astc_luts.h" ++ ++namespace Granite ++{ ++static void build_astc_unquant_weight_lut(uint8_t *lut, size_t range, const ASTCQuantizationMode &mode) ++{ ++ for (size_t i = 0; i < range; i++) ++ { ++ auto &v = lut[i]; ++ ++ if (!mode.quints && !mode.trits) ++ { ++ switch (mode.bits) ++ { ++ case 1: ++ v = i * 63; ++ break; ++ ++ case 2: ++ v = i * 0x15; ++ break; ++ ++ case 3: ++ v = i * 9; ++ break; ++ ++ case 4: ++ v = (i << 2) | (i >> 2); ++ break; ++ ++ case 5: ++ v = (i << 1) | (i >> 4); ++ break; ++ ++ default: ++ v = 0; ++ break; ++ } ++ } ++ else if (mode.bits == 0) ++ { ++ if (mode.trits) ++ v = 32 * i; ++ else ++ v = 16 * i; ++ } ++ else ++ { ++ unsigned b = (i >> 1) & 1; ++ unsigned c = (i >> 2) & 1; ++ unsigned A, B, C, D; ++ ++ A = 0x7f * (i & 1); ++ D = i >> mode.bits; ++ B = 0; ++ ++ if (mode.trits) ++ { ++ static const unsigned Cs[3] = { 50, 23, 11 }; ++ C = Cs[mode.bits - 1]; ++ if (mode.bits == 2) ++ B = 0x45 * b; ++ else if (mode.bits == 3) ++ B = 0x21 * b + 0x42 * c; ++ } ++ else ++ { ++ static const unsigned Cs[2] = { 28, 13 }; ++ C = Cs[mode.bits - 1]; ++ if (mode.bits == 2) ++ B = 0x42 * b; ++ } ++ ++ unsigned unq = D * C + B; ++ unq ^= A; ++ unq = (A & 0x20) | (unq >> 2); ++ v = unq; ++ } ++ ++ // Expand [0, 63] to [0, 64]. ++ if (mode.bits != 0 && v > 32) ++ v++; ++ } ++} ++ ++static void build_astc_unquant_endpoint_lut(uint8_t *lut, size_t range, const ASTCQuantizationMode &mode) ++{ ++ for (size_t i = 0; i < range; i++) ++ { ++ auto &v = lut[i]; ++ ++ if (!mode.quints && !mode.trits) ++ { ++ // Bit-replication. 
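++ // A plain b-bit endpoint expands to 8 bits by replicating its top
++ // bits into the vacated low bits, which maps 0 to 0x00 and the
++ // largest code to 0xff exactly. E.g. with 5 bits, (i << 3) | (i >> 2)
++ // turns 0x1f into 0xff and 0x10 into 0x84.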
++ switch (mode.bits) ++ { ++ case 1: ++ v = i * 0xff; ++ break; ++ ++ case 2: ++ v = i * 0x55; ++ break; ++ ++ case 3: ++ v = (i << 5) | (i << 2) | (i >> 1); ++ break; ++ ++ case 4: ++ v = i * 0x11; ++ break; ++ ++ case 5: ++ v = (i << 3) | (i >> 2); ++ break; ++ ++ case 6: ++ v = (i << 2) | (i >> 4); ++ break; ++ ++ case 7: ++ v = (i << 1) | (i >> 6); ++ break; ++ ++ default: ++ v = i; ++ break; ++ } ++ } ++ else ++ { ++ unsigned A, B, C, D; ++ unsigned b = (i >> 1) & 1; ++ unsigned c = (i >> 2) & 1; ++ unsigned d = (i >> 3) & 1; ++ unsigned e = (i >> 4) & 1; ++ unsigned f = (i >> 5) & 1; ++ ++ B = 0; ++ D = i >> mode.bits; ++ A = (i & 1) * 0x1ff; ++ ++ if (mode.trits) ++ { ++ static const unsigned Cs[6] = { 204, 93, 44, 22, 11, 5 }; ++ C = Cs[mode.bits - 1]; ++ ++ switch (mode.bits) ++ { ++ case 2: ++ B = b * 0x116; ++ break; ++ ++ case 3: ++ B = b * 0x85 + c * 0x10a; ++ break; ++ ++ case 4: ++ B = b * 0x41 + c * 0x82 + d * 0x104; ++ break; ++ ++ case 5: ++ B = b * 0x20 + c * 0x40 + d * 0x81 + e * 0x102; ++ break; ++ ++ case 6: ++ B = b * 0x10 + c * 0x20 + d * 0x40 + e * 0x80 + f * 0x101; ++ break; ++ } ++ } ++ else ++ { ++ static const unsigned Cs[5] = { 113, 54, 26, 13, 6 }; ++ C = Cs[mode.bits - 1]; ++ ++ switch (mode.bits) ++ { ++ case 2: ++ B = b * 0x10c; ++ break; ++ ++ case 3: ++ B = b * 0x82 + c * 0x105; ++ break; ++ ++ case 4: ++ B = b * 0x40 + c * 0x81 + d * 0x102; ++ break; ++ ++ case 5: ++ B = b * 0x20 + c * 0x40 + d * 0x80 + e * 0x101; ++ break; ++ } ++ } ++ ++ unsigned unq = D * C + B; ++ unq ^= A; ++ unq = (A & 0x80) | (unq >> 2); ++ v = uint8_t(unq); ++ } ++ } ++} ++ ++static unsigned astc_value_range(const ASTCQuantizationMode &mode) ++{ ++ unsigned value_range = 1u << mode.bits; ++ if (mode.trits) ++ value_range *= 3; ++ if (mode.quints) ++ value_range *= 5; ++ ++ if (value_range == 1) ++ value_range = 0; ++ return value_range; ++} ++ ++static uint32_t astc_hash52(uint32_t p) ++{ ++ p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; ++ p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; ++ p ^= p << 6; p ^= p >> 17; ++ return p; ++} ++ ++// Copy-paste from spec. ++static int astc_select_partition(int seed, int x, int y, int z, int partitioncount, bool small_block) ++{ ++ if (small_block) ++ { ++ x <<= 1; ++ y <<= 1; ++ z <<= 1; ++ } ++ ++ seed += (partitioncount - 1) * 1024; ++ uint32_t rnum = astc_hash52(seed); ++ uint8_t seed1 = rnum & 0xF; ++ uint8_t seed2 = (rnum >> 4) & 0xF; ++ uint8_t seed3 = (rnum >> 8) & 0xF; ++ uint8_t seed4 = (rnum >> 12) & 0xF; ++ uint8_t seed5 = (rnum >> 16) & 0xF; ++ uint8_t seed6 = (rnum >> 20) & 0xF; ++ uint8_t seed7 = (rnum >> 24) & 0xF; ++ uint8_t seed8 = (rnum >> 28) & 0xF; ++ uint8_t seed9 = (rnum >> 18) & 0xF; ++ uint8_t seed10 = (rnum >> 22) & 0xF; ++ uint8_t seed11 = (rnum >> 26) & 0xF; ++ uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; ++ ++ seed1 *= seed1; seed2 *= seed2; seed3 *= seed3; seed4 *= seed4; ++ seed5 *= seed5; seed6 *= seed6; seed7 *= seed7; seed8 *= seed8; ++ seed9 *= seed9; seed10 *= seed10; seed11 *= seed11; seed12 *= seed12; ++ ++ int sh1, sh2, sh3; ++ if (seed & 1) ++ { ++ sh1 = seed & 2 ? 4 : 5; ++ sh2 = partitioncount == 3 ? 6 : 5; ++ } ++ else ++ { ++ sh1 = partitioncount == 3 ? 6 : 5; ++ sh2 = seed & 2 ? 4 : 5; ++ } ++ sh3 = (seed & 0x10) ? 
sh1 : sh2;
++
++ seed1 >>= sh1; seed2 >>= sh2; seed3 >>= sh1; seed4 >>= sh2;
++ seed5 >>= sh1; seed6 >>= sh2; seed7 >>= sh1; seed8 >>= sh2;
++ seed9 >>= sh3; seed10 >>= sh3; seed11 >>= sh3; seed12 >>= sh3;
++
++ int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
++ int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
++ int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
++ int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
++
++ a &= 0x3f; b &= 0x3f; c &= 0x3f; d &= 0x3f;
++
++ if (partitioncount < 4)
++ d = 0;
++ if (partitioncount < 3)
++ c = 0;
++
++ if (a >= b && a >= c && a >= d)
++ return 0;
++ else if (b >= c && b >= d)
++ return 1;
++ else if (c >= d)
++ return 2;
++ else
++ return 3;
++}
++
++ASTCLutHolder::PartitionTable::PartitionTable(unsigned block_width, unsigned block_height)
++{
++ bool small_block = (block_width * block_height) < 31;
++
++ lut_width = block_width * 32;
++ lut_height = block_height * 32;
++ lut_buffer.resize(lut_width * lut_height);
++
++ for (unsigned seed_y = 0; seed_y < 32; seed_y++)
++ {
++ for (unsigned seed_x = 0; seed_x < 32; seed_x++)
++ {
++ unsigned seed = seed_y * 32 + seed_x;
++ for (unsigned block_y = 0; block_y < block_height; block_y++)
++ {
++ for (unsigned block_x = 0; block_x < block_width; block_x++)
++ {
++ int part2 = astc_select_partition(seed, block_x, block_y, 0, 2, small_block);
++ int part3 = astc_select_partition(seed, block_x, block_y, 0, 3, small_block);
++ int part4 = astc_select_partition(seed, block_x, block_y, 0, 4, small_block);
++ lut_buffer[(seed_y * block_height + block_y) * lut_width + (seed_x * block_width + block_x)] =
++ (part2 << 0) | (part3 << 2) | (part4 << 4);
++ }
++ }
++ }
++ }
++}
++
++ASTCLutHolder::PartitionTable &ASTCLutHolder::get_partition_table(unsigned width, unsigned height)
++{
++ std::lock_guard<std::mutex> holder{table_lock};
++ auto itr = tables.find(width * 16 + height);
++ if (itr != tables.end())
++ {
++ return itr->second;
++ }
++ else
++ {
++ auto &t = tables[width * 16 + height];
++ t = { width, height };
++ return t;
++ }
++}
++
++ASTCLutHolder &get_astc_luts()
++{
++ static ASTCLutHolder holder;
++ return holder;
++}
++
++ASTCLutHolder::ASTCLutHolder()
++{
++ init_color_endpoint();
++ init_weight_luts();
++ init_trits_quints();
++}
++
++void ASTCLutHolder::init_color_endpoint()
++{
++ auto &unquant_lut = color_endpoint.unquant_lut;
++
++ for (size_t i = 0; i < astc_num_quantization_modes; i++)
++ {
++ auto value_range = astc_value_range(astc_quantization_modes[i]);
++ color_endpoint.unquant_lut_offsets[i] = color_endpoint.unquant_offset;
++ build_astc_unquant_endpoint_lut(unquant_lut + color_endpoint.unquant_offset, value_range, astc_quantization_modes[i]);
++ color_endpoint.unquant_offset += value_range;
++ }
++
++ auto &lut = color_endpoint.lut;
++
++ // We can have a maximum of 9 endpoint pairs, i.e. 18 endpoint values in total.
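++ // A quantization mode fits when its ISE bit cost is within the remaining
++ // bits: num_values plain values cost bits * num_values, trits add
++ // ceil(8 * num_values / 5) bits and quints add ceil(7 * num_values / 3)
++ // bits, which is what the total_bits expression below computes with
++ // integer rounding.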
++ for (unsigned pairs_minus_1 = 0; pairs_minus_1 < 9; pairs_minus_1++) ++ { ++ for (unsigned remaining = 0; remaining < 128; remaining++) ++ { ++ bool found_mode = false; ++ for (auto &mode : astc_quantization_modes) ++ { ++ unsigned num_values = (pairs_minus_1 + 1) * 2; ++ unsigned total_bits = mode.bits * num_values + ++ (mode.quints * 7 * num_values + 2) / 3 + ++ (mode.trits * 8 * num_values + 4) / 5; ++ ++ if (total_bits <= remaining) ++ { ++ found_mode = true; ++ lut[pairs_minus_1][remaining][0] = mode.bits; ++ lut[pairs_minus_1][remaining][1] = mode.trits; ++ lut[pairs_minus_1][remaining][2] = mode.quints; ++ lut[pairs_minus_1][remaining][3] = color_endpoint.unquant_lut_offsets[&mode - astc_quantization_modes]; ++ break; ++ } ++ } + -+ radv_meta_restore(&saved_state, cmd_buffer); ++ if (!found_mode) ++ memset(lut[pairs_minus_1][remaining], 0, sizeof(lut[pairs_minus_1][remaining])); ++ } ++ } +} -diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h -index 472858dd401..04a670d714d 100644 ---- a/src/amd/vulkan/radv_private.h -+++ b/src/amd/vulkan/radv_private.h -@@ -98,6 +98,7 @@ typedef uint32_t xcb_window_t; - - #include "vk_texcompress_astc.h" - #include "wsi_common.h" -+#include "vk_texcompress_bcn.h" - - #ifdef __cplusplus - extern "C" -@@ -681,6 +682,8 @@ struct radv_meta_state { - } etc_decode; - - struct vk_texcompress_astc_state *astc_decode; + -+ struct vk_texcompress_bcn_state *bcn_encoded[VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES]; - }; - - #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1) -@@ -2090,8 +2093,12 @@ struct radv_image { - bool dcc_sign_reinterpret; - bool support_comp_to_single; - -- bool isNeedSoftEncode;// 是否需要使用 软编 -- bool isNeedSoftDecode;// 是否需要使用 软解 -+ bool isNeedSoftEncode; // 是否需要使用 软编 -+ bool isNeedSoftDecode; // 是否需要使用 软解 ++void ASTCLutHolder::init_weight_luts() ++{ ++ auto &lut = weights.lut; ++ auto &unquant_lut = weights.unquant_lut; ++ auto &unquant_offset = weights.unquant_offset; ++ ++ for (size_t i = 0; i < astc_num_weight_modes; i++) ++ { ++ auto value_range = astc_value_range(astc_weight_modes[i]); ++ lut[i][0] = astc_weight_modes[i].bits; ++ lut[i][1] = astc_weight_modes[i].trits; ++ lut[i][2] = astc_weight_modes[i].quints; ++ lut[i][3] = unquant_offset; ++ build_astc_unquant_weight_lut(unquant_lut + unquant_offset, value_range, astc_weight_modes[i]); ++ unquant_offset += value_range; ++ } + -+ bool isNeedHardEncode; // 是否需要使用 硬编 -+ struct vk_texcompress_bcn_image *staging_images; // 硬编的中间资源 ++ assert(unquant_offset <= 256); ++} + - VkFormat srcFormat; //用于保存纹理的原始格式 - VkFormat unpackETCFormat;//用于保存ETC纹理解压后的格式 - -@@ -2474,6 +2481,8 @@ void radv_image_view_init(struct radv_image_view *view, struct radv_device *devi - const struct radv_image_view_extra_create_info *extra_create_info); - void radv_image_view_finish(struct radv_image_view *iview); - -+void radv_internal_image_finish(struct radv_image *image); ++void ASTCLutHolder::init_trits_quints() ++{ ++ // From specification. ++ auto &trits_quints = integer.trits_quints; + - VkFormat radv_get_aspect_format(struct radv_image *image, VkImageAspectFlags mask); - - struct radv_sampler_ycbcr_conversion_state { -diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl -index 251635d7b69..627b2e5419c 100644 ---- a/src/compiler/glsl/bc1.glsl -+++ b/src/compiler/glsl/bc1.glsl -@@ -1,3 +1,4 @@ -+#version 310 es - /* - * Copyright 2020-2022 Matias N. Goldberg - * Copyright 2022 Intel Corporation -@@ -21,7 +22,6 @@ - * DEALINGS IN THE SOFTWARE. 
- */ - --#version 310 es - - #if defined(GL_ES) && GL_ES == 1 - // Desktop GLSL allows the const keyword for either compile-time or -@@ -31,10 +31,31 @@ - #define const - #endif - --%s // include "CrossPlatformSettings_piece_all.glsl" -- - #define FLT_MAX 340282346638528859811704183484516925440.0f - -+#ifdef VULKAN ++ for (unsigned T = 0; T < 256; T++) ++ { ++ unsigned C; ++ uint8_t t0, t1, t2, t3, t4; + -+#extension GL_GOOGLE_include_directive : require -+#include "CrossPlatformSettings_piece_all.glsl" ++ if (((T >> 2) & 7) == 7) ++ { ++ C = (((T >> 5) & 7) << 2) | (T & 3); ++ t4 = t3 = 2; ++ } ++ else ++ { ++ C = T & 0x1f; ++ if (((T >> 5) & 3) == 3) ++ { ++ t4 = 2; ++ t3 = (T >> 7) & 1; ++ } ++ else ++ { ++ t4 = (T >> 7) & 1; ++ t3 = (T >> 5) & 3; ++ } ++ } + -+layout(push_constant, std430) uniform pc { -+ uint p_numRefinements; -+}; ++ if ((C & 3) == 3) ++ { ++ t2 = 2; ++ t1 = (C >> 4) & 1; ++ t0 = (((C >> 3) & 1) << 1) | (((C >> 2) & 1) & ~(((C >> 3) & 1))); ++ } ++ else if (((C >> 2) & 3) == 3) ++ { ++ t2 = 2; ++ t1 = 2; ++ t0 = C & 3; ++ } ++ else ++ { ++ t2 = (C >> 4) & 1; ++ t1 = (C >> 2) & 3; ++ t0 = (((C >> 1) & 1) << 1) | ((C & 1) & ~(((C >> 1) & 1))); ++ } + -+layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ trits_quints[T] = t0 | (t1 << 3) | (t2 << 6) | (t3 << 9) | (t4 << 12); ++ } + -+layout (std430, set = 0, binding = 1) readonly restrict buffer globalBuffer -+{ -+ float2 c_oMatch5[256]; -+ float2 c_oMatch6[256]; -+}; ++ for (unsigned Q = 0; Q < 128; Q++) ++ { ++ unsigned C; ++ uint8_t q0, q1, q2; ++ if (((Q >> 1) & 3) == 3 && ((Q >> 5) & 3) == 0) ++ { ++ q2 = ((Q & 1) << 2) | ((((Q >> 4) & 1) & ~(Q & 1)) << 1) | (((Q >> 3) & 1) & ~(Q & 1)); ++ q1 = q0 = 4; ++ } ++ else ++ { ++ if (((Q >> 1) & 3) == 3) ++ { ++ q2 = 4; ++ C = (((Q >> 3) & 3) << 3) | ((~(Q >> 5) & 3) << 1) | (Q & 1); ++ } ++ else ++ { ++ q2 = (Q >> 5) & 3; ++ C = Q & 0x1f; ++ } + -+layout (rgba16ui, set = 0, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ if ((C & 7) == 5) ++ { ++ q1 = 4; ++ q0 = (C >> 3) & 3; ++ } ++ else ++ { ++ q1 = (C >> 3) & 3; ++ q0 = C & 7; ++ } ++ } + -+#else ++ trits_quints[256 + Q] = q0 | (q1 << 3) | (q2 << 6); ++ } ++} ++} +diff --git a/src/util/texcompress_astc_luts.h b/src/util/texcompress_astc_luts.h +new file mode 100644 +index 00000000000..b3c26033a9a +--- /dev/null ++++ b/src/util/texcompress_astc_luts.h +@@ -0,0 +1,127 @@ ++/* Copyright (c) 2017-2022 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#pragma once
++
++#include <stdint.h>
++#include <stddef.h>
++#include <vector>
++#include <mutex>
++#include <unordered_map>
++
++namespace Granite
++{
++
++struct ASTCQuantizationMode
++{
++ uint8_t bits, trits, quints;
++};
++
++// In order to decode color endpoints, we need to convert available bits and number of values
++// into a format of (bits, trits, quints). A simple LUT texture is a reasonable approach for this.
++// Decoders are expected to have some form of LUT to deal with this ...
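++//
++// Each mode below covers (1 << bits) * 3^trits * 5^quints distinct values;
++// for example { 6, 1, 0 } encodes 64 * 3 = 192 values and { 5, 0, 1 }
++// encodes 32 * 5 = 160 (see astc_value_range() in texcompress_astc_luts.cpp).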
++static const ASTCQuantizationMode astc_quantization_modes[] = { ++ { 8, 0, 0 }, ++ { 6, 1, 0 }, ++ { 5, 0, 1 }, ++ { 7, 0, 0 }, ++ { 5, 1, 0 }, ++ { 4, 0, 1 }, ++ { 6, 0, 0 }, ++ { 4, 1, 0 }, ++ { 3, 0, 1 }, ++ { 5, 0, 0 }, ++ { 3, 1, 0 }, ++ { 2, 0, 1 }, ++ { 4, 0, 0 }, ++ { 2, 1, 0 }, ++ { 1, 0, 1 }, ++ { 3, 0, 0 }, ++ { 1, 1, 0 }, ++}; + -+shared float2 g_minMaxValues[4u * 4u * 4u]; -+shared uint2 g_mask[4u * 4u]; ++constexpr size_t astc_num_quantization_modes = sizeof(astc_quantization_modes) / sizeof(astc_quantization_modes[0]); ++ ++static const ASTCQuantizationMode astc_weight_modes[] = { ++ { 0, 0, 0 }, // Invalid ++ { 0, 0, 0 }, // Invalid ++ { 1, 0, 0 }, ++ { 0, 1, 0 }, ++ { 2, 0, 0 }, ++ { 0, 0, 1 }, ++ { 1, 1, 0 }, ++ { 3, 0, 0 }, ++ { 0, 0, 0 }, // Invalid ++ { 0, 0, 0 }, // Invalid ++ { 1, 0, 1 }, ++ { 2, 1, 0 }, ++ { 4, 0, 0 }, ++ { 2, 0, 1 }, ++ { 3, 1, 0 }, ++ { 5, 0, 0 }, ++}; + -+#define p_channelIdx params.x -+#define p_useSNorm params.y ++constexpr size_t astc_num_weight_modes = sizeof(astc_weight_modes) / sizeof(astc_weight_modes[0]); + - layout( local_size_x = 4, // - local_size_y = 4, // - local_size_z = 4 ) in; -diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl -index f90accb82e1..8d50c469956 100644 ---- a/src/compiler/glsl/etc2_rgba_stitch.glsl -+++ b/src/compiler/glsl/etc2_rgba_stitch.glsl -@@ -1,3 +1,5 @@ -+#version 310 es ++struct ASTCLutHolder ++{ ++ ASTCLutHolder(); ++ ++ void init_color_endpoint(); ++ void init_weight_luts(); ++ void init_trits_quints(); ++ ++ struct ++ { ++ size_t unquant_offset = 0; ++ uint8_t unquant_lut[2048]; ++ uint16_t lut[9][128][4]; ++ size_t unquant_lut_offsets[astc_num_quantization_modes]; ++ } color_endpoint; ++ ++ struct ++ { ++ size_t unquant_offset = 0; ++ uint8_t unquant_lut[2048]; ++ uint8_t lut[astc_num_weight_modes][4]; ++ } weights; ++ ++ struct ++ { ++ uint16_t trits_quints[256 + 128]; ++ } integer; ++ ++ struct PartitionTable ++ { ++ PartitionTable() = default; ++ PartitionTable(unsigned width, unsigned height); ++ std::vector lut_buffer; ++ unsigned lut_width = 0; ++ unsigned lut_height = 0; ++ }; ++ ++ std::mutex table_lock; ++ std::unordered_map tables; ++ ++ PartitionTable &get_partition_table(unsigned width, unsigned height); ++}; + - /* - * Copyright 2020-2022 Matias N. 
Goldberg - * Copyright 2022 Intel Corporation -@@ -25,18 +27,30 @@ - // This compute shader merely stitches them together to form the final result - // It's also used by RG11 driver to stitch two R11 into one RG11 - --#version 310 es -+#ifdef VULKAN - --%s // include "CrossPlatformSettings_piece_all.glsl" -+#extension GL_GOOGLE_include_directive : require -+#include "CrossPlatformSettings_piece_all.glsl" - --layout( local_size_x = 8, // -- local_size_y = 8, // -- local_size_z = 1 ) in; -+layout ( binding = 0 ) uniform highp usampler2D srcRGB; -+layout ( binding = 1 ) uniform highp usampler2D srcAlpha; ++ASTCLutHolder &get_astc_luts(); ++} +diff --git a/src/util/texcompress_astc_luts_wrap.cpp b/src/util/texcompress_astc_luts_wrap.cpp +new file mode 100644 +index 00000000000..6020a34e2c9 +--- /dev/null ++++ b/src/util/texcompress_astc_luts_wrap.cpp +@@ -0,0 +1,64 @@ ++/* ++ * Copyright © 2022 Intel Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included ++ * in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */ + -+layout ( rgba32ui, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++#include "texcompress_astc_luts_wrap.h" ++#include "texcompress_astc_luts.h" + -+#else ++extern "C" void ++_mesa_init_astc_decoder_luts(astc_decoder_lut_holder *holder) ++{ ++ auto &luts = Granite::get_astc_luts(); + -+%s // include "CrossPlatformSettings_piece_all.glsl" - - layout( binding = 0 ) uniform highp usampler2D srcRGB; - layout( binding = 1 ) uniform highp usampler2D srcAlpha; - layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; - -+#endif ++ holder->color_endpoint.data = luts.color_endpoint.lut; ++ holder->color_endpoint.size_B = sizeof(luts.color_endpoint.lut); ++ holder->color_endpoint.format = PIPE_FORMAT_R16G16B16A16_UINT; + -+layout( local_size_x = 8, // -+ local_size_y = 8, // -+ local_size_z = 1 ) in; ++ holder->color_endpoint_unquant.data = luts.color_endpoint.unquant_lut; ++ holder->color_endpoint_unquant.size_B = luts.color_endpoint.unquant_offset; ++ holder->color_endpoint_unquant.format = PIPE_FORMAT_R8_UINT; + - void main() - { - uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; -diff --git a/src/compiler/meson.build b/src/compiler/meson.build -index e18af3bf335..85b19b84bec 100644 ---- a/src/compiler/meson.build -+++ b/src/compiler/meson.build -@@ -24,6 +24,14 @@ inc_spirv = include_directories('spirv') - - astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') - -+bc3_encode_head_dir = meson.current_source_dir() / 'glsl' ++ holder->weights.data = luts.weights.lut; ++ holder->weights.size_B = sizeof(luts.weights.lut); ++ holder->weights.format = PIPE_FORMAT_R8G8B8A8_UINT; + -+bc1_glsl_file = files('glsl/bc1.glsl') ++ holder->weights_unquant.data = luts.weights.unquant_lut; ++ holder->weights_unquant.size_B = luts.weights.unquant_offset; ++ holder->weights_unquant.format = PIPE_FORMAT_R8_UINT; + -+bc4_glsl_file = files('glsl/bc4.glsl') ++ holder->trits_quints.data = luts.integer.trits_quints; ++ holder->trits_quints.size_B = sizeof(luts.integer.trits_quints); ++ holder->trits_quints.format = PIPE_FORMAT_R16_UINT; ++} + -+etc2_rgba_stitch_glsl_file = files('glsl/etc2_rgba_stitch.glsl') ++extern "C" void * ++_mesa_get_astc_decoder_partition_table(uint32_t block_width, ++ uint32_t block_height, ++ unsigned *lut_width, ++ unsigned *lut_height) ++{ ++ auto &luts = Granite::get_astc_luts(); ++ auto &table = luts.get_partition_table(block_width, block_height); + - files_libcompiler = files( - 'builtin_type_macros.h', - 'glsl_types.cpp', -diff --git a/src/util/bc1_tables.h b/src/util/bc1_tables.h ++ *lut_width = table.lut_width; ++ *lut_height = table.lut_height; ++ return table.lut_buffer.data(); ++} +diff --git a/src/util/texcompress_astc_luts_wrap.h b/src/util/texcompress_astc_luts_wrap.h new file mode 100644 -index 00000000000..654ccdadb78 +index 00000000000..2d4a6eb6d85 --- /dev/null -+++ b/src/util/bc1_tables.h -@@ -0,0 +1,124 @@ ++++ b/src/util/texcompress_astc_luts_wrap.h +@@ -0,0 +1,62 @@ +/* -+ * Copyright 2020-2022 Matias N. 
Goldberg ++ * Copyright © 2022 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), @@ -998,243 +4976,85 @@ index 00000000000..654ccdadb78 + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * -+ * The above copyright notice and this permission notice shall be included in -+ * all copies or substantial portions of the Software. ++ * The above copyright notice and this permission notice shall be included ++ * in all copies or substantial portions of the Software. + * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS ++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + -+/* These tables have been generated with: ++#ifndef TEXCOMPRESS_ASTC_LUTS_WRAP_H ++#define TEXCOMPRESS_ASTC_LUTS_WRAP_H + -+ #define STB_DXT_IMPLEMENTATION -+ #include "stb_dxt.h" -+ #include -+ -+ int main() -+ { -+ stb__InitDXT(); ++#include ++#include ++#include "format/u_format.h" + -+ printf( "static unsigned char stb__OMatch5[256][2] = {\n" ); -+ for( size_t i = 0u; i < 256u; ++i ) -+ { -+ printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] ); -+ if( i != 255u ) -+ printf( ",\n" ); -+ } -+ printf( "\n};\n" ); ++/* C wrapper for Granite::ASTCLutHolder. 
*/ + -+ printf( "static unsigned char stb__OMatch6[256][2] = {\n" ); -+ for( size_t i = 0u; i < 256u; ++i ) -+ { -+ printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] ); -+ if( i != 255u ) -+ printf( ",\n" ); -+ } -+ printf( "\n};\n" ); ++#ifdef __cplusplus ++extern "C" { ++#endif + -+ return 0; -+ } ++typedef struct ++{ ++ void *data; ++ size_t size_B; ++ enum pipe_format format; ++} astc_decoder_lut; + -+ Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min -+*/ -+static unsigned char stb__OMatch5[256][2] = { -+ { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, -+ { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 }, -+ { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 }, -+ { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 }, -+ { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 }, -+ { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 }, -+ { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 }, -+ { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 }, -+ { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 }, -+ { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 }, -+ { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 }, -+ { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 }, -+ { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 }, -+ { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 }, -+ { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 }, -+ { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 }, -+ { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 }, -+ { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 }, -+ { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 }, -+ { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 }, -+ { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 }, -+ { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 }, -+ { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 }, -+ { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 }, -+ { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 }, -+ { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 }, -+ { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 }, -+ { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 }, -+ { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 }, -+ { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 }, -+ { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 }, -+ { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 } -+}; 
++typedef struct ++{ ++ astc_decoder_lut color_endpoint; ++ astc_decoder_lut color_endpoint_unquant; ++ astc_decoder_lut weights; ++ astc_decoder_lut weights_unquant; ++ astc_decoder_lut trits_quints; ++} astc_decoder_lut_holder; ++ ++void _mesa_init_astc_decoder_luts(astc_decoder_lut_holder *holder); ++void *_mesa_get_astc_decoder_partition_table(uint32_t block_width, ++ uint32_t block_height, ++ unsigned *lut_width, ++ unsigned *lut_height); ++ ++#ifdef __cplusplus ++} ++#endif + -+static unsigned char stb__OMatch6[256][2] = { -+ { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 }, -+ { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 }, -+ { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 }, -+ { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 }, -+ { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 }, -+ { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 }, -+ { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 }, -+ { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 }, -+ { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 }, -+ { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 }, -+ { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 }, -+ { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 }, -+ { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 }, -+ { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 }, -+ { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 }, -+ { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 }, -+ { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 }, -+ { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 }, -+ { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 }, -+ { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 }, -+ { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 }, -+ { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 }, -+ { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 }, -+ { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 }, -+ { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 }, -+ { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 }, -+ { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 }, -+ { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 }, -+ { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 }, -+ { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 }, -+ { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 }, -+ { 63, 58 }, { 63, 59 }, { 63, 
59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 } -+}; ++#endif diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build -index 1c02e1a5dd8..54c89d793a0 100644 +index af134efb04e..1c02e1a5dd8 100644 --- a/src/vulkan/runtime/meson.build +++ b/src/vulkan/runtime/meson.build -@@ -93,12 +93,31 @@ endif - - if prog_glslang.found() - vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h') -+ vulkan_runtime_files += files('vk_texcompress_bcn.c', 'vk_texcompress_bcn.h') - vulkan_runtime_files += custom_target( - 'astc_spv.h', - input : astc_decoder_glsl_file, - output : 'astc_spv.h', - command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, - ) -+ vulkan_runtime_files += custom_target( -+ 'bc1_spv.h', -+ input : bc1_glsl_file, -+ output : 'bc1_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) -+ vulkan_runtime_files += custom_target( -+ 'bc4_spv.h', -+ input : bc4_glsl_file, -+ output : 'bc4_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) -+ vulkan_runtime_files += custom_target( -+ 'bc3_spv.h', -+ input : etc2_rgba_stitch_glsl_file, -+ output : 'bc3_spv.h', -+ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-Ibc3_encode_head_dir', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, -+ ) +@@ -91,6 +91,16 @@ if with_platform_android + vulkan_runtime_deps += dep_android endif - vk_common_entrypoints = custom_target( -diff --git a/src/vulkan/runtime/vk_command_buffer.h b/src/vulkan/runtime/vk_command_buffer.h -index 36099d03a56..1a04b53510f 100644 ---- a/src/vulkan/runtime/vk_command_buffer.h -+++ b/src/vulkan/runtime/vk_command_buffer.h -@@ -28,6 +28,7 @@ - #include "vk_object.h" - #include "util/list.h" - #include "util/u_dynarray.h" -+#include "vk_texcompress_bcn.h" - - #ifdef __cplusplus - extern "C" { -@@ -130,6 +131,12 @@ struct vk_command_buffer { - /* This uses the same trick as STACK_ARRAY */ - struct vk_attachment_state *attachments; - struct vk_attachment_state _attachments[8]; -+ -+ bool is_need_hard_encode; -+ -+ struct vk_texcompress_staging_images* staging_image_list_head; -+ -+ struct vk_texcompress_dummy_image* dummy; - }; - - VK_DEFINE_HANDLE_CASTS(vk_command_buffer, base, VkCommandBuffer, -diff --git a/src/vulkan/runtime/vk_queue.c b/src/vulkan/runtime/vk_queue.c -index 6216af9d856..15994e3420d 100644 ---- a/src/vulkan/runtime/vk_queue.c -+++ b/src/vulkan/runtime/vk_queue.c -@@ -42,6 +42,7 @@ - #include "vk_util.h" - - #include "vulkan/wsi/wsi_common.h" -+#include "vk_texcompress_bcn.h" - - static VkResult - vk_queue_start_submit_thread(struct vk_queue *queue); -@@ -1140,7 +1141,22 @@ vk_common_QueueSubmit2KHR(VkQueue _queue, - } - } - -+ int n_command_buffers = 0; -+ for (uint32_t s = 0; s < submitCount; s++) { -+ n_command_buffers += pSubmits[s].commandBufferInfoCount; -+ } -+ const uint32_t const_num_cmd_buf = n_command_buffers; -+ uint32_t num_cmd_buf = 0; -+ struct vk_command_buffer *need_delete_cmd_buf[const_num_cmd_buf]; -+ - for (uint32_t i = 0; i < submitCount; i++) { -+ for (uint32_t j = 0; j < pSubmits[i].commandBufferInfoCount; j++) { -+ VK_FROM_HANDLE(vk_command_buffer, commandBuffer, (pSubmits[i].pCommandBufferInfos)[j].commandBuffer); -+ if (commandBuffer->is_need_hard_encode) { -+ need_delete_cmd_buf[num_cmd_buf] = commandBuffer; -+ num_cmd_buf += 1; -+ } -+ } - struct vulkan_submit_info info = 
{ - .pNext = pSubmits[i].pNext, - .command_buffer_count = pSubmits[i].commandBufferInfoCount, -@@ -1156,6 +1172,20 @@ vk_common_QueueSubmit2KHR(VkQueue _queue, - return result; - } - -+ if (num_cmd_buf > 0 && fence == NULL) { -+ vk_texcompress_create_fence(queue->base.device, NULL, &_fence); -+ } -+ -+ if (num_cmd_buf > 0) { -+ vk_texcompress_wait_fence(queue->base.device, NULL, &_fence); -+ for (uint32_t i = 0; i < num_cmd_buf; i++) { -+ while (need_delete_cmd_buf[i]->staging_image_list_head != NULL) { -+ vk_texcompress_delete_head(&need_delete_cmd_buf[i]->staging_image_list_head, queue->base.device); -+ } -+ need_delete_cmd_buf[i]->is_need_hard_encode = false; -+ } -+ } ++if prog_glslang.found() ++ vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h') ++ vulkan_runtime_files += custom_target( ++ 'astc_spv.h', ++ input : astc_decoder_glsl_file, ++ output : 'astc_spv.h', ++ command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet, ++ ) ++endif + - return VK_SUCCESS; - } - -diff --git a/src/vulkan/runtime/vk_texcompress_bcn.c b/src/vulkan/runtime/vk_texcompress_bcn.c + vk_common_entrypoints = custom_target( + 'vk_common_entrypoints', + input : [vk_entrypoints_gen, vk_api_xml], +diff --git a/src/vulkan/runtime/vk_texcompress_astc.c b/src/vulkan/runtime/vk_texcompress_astc.c new file mode 100644 -index 00000000000..2c85d98af5c +index 00000000000..9275f426b76 --- /dev/null -+++ b/src/vulkan/runtime/vk_texcompress_bcn.c -@@ -0,0 +1,740 @@ ++++ b/src/vulkan/runtime/vk_texcompress_astc.c +@@ -0,0 +1,636 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining @@ -1257,39 +5077,13 @@ index 00000000000..2c85d98af5c + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + -+#include "vk_texcompress_bcn.h" -+#include "util/bc1_tables.h" ++#include "vk_texcompress_astc.h" ++#include "util/texcompress_astc_luts_wrap.h" +#include "vk_alloc.h" ++#include "vk_device.h" +#include "vk_format.h" +#include "vk_image.h" +#include "vk_physical_device.h" -+#include "log/log.h" -+ -+void -+vk_texcompress_insert_head(staging_images **head, bcn_image *current) -+{ -+ staging_images *node = (staging_images *)malloc(sizeof(staging_images)); -+ if (node == NULL) { -+ ALOGE("Failed to alloc staging_images node"); -+ return; -+ } -+ -+ node->current = current; -+ node->next = *head; -+ *head = node; -+} -+ -+void -+vk_texcompress_delete_head(staging_images **head, struct vk_device *device) -+{ -+ if (*head == NULL) { -+ return; -+ } -+ staging_images *current_head = *head; -+ vk_texcompress_finish_image(device, NULL, current_head->current); -+ *head = current_head->next; -+ free(current_head); -+} + +/* type_indexes_mask bits are set/clear for support memory type index as per + * struct VkPhysicalDeviceMemoryProperties.memoryTypes[] */ @@ -1317,9 +5111,11 @@ index 00000000000..2c85d98af5c +} + +static VkResult -+create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, struct vk_texcompress_bcn_state *bc1) ++vk_create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkDeviceSize size, VkMemoryPropertyFlags mem_prop_flags, ++ VkBufferUsageFlags usage_flags, VkBuffer *vk_buf, ++ VkDeviceMemory *vk_mem) +{ -+ VkDeviceSize size = sizeof(float) * (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)); + VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; @@ -1327,17 +5123,17 @@ index 00000000000..2c85d98af5c + VkBufferCreateInfo buffer_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = size, -+ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ++ .usage = usage_flags, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + result = -+ disp->CreateBuffer(_device, &buffer_create_info, allocator, &bc1->bc1_table_buf); ++ disp->CreateBuffer(_device, &buffer_create_info, allocator, vk_buf); + if (unlikely(result != VK_SUCCESS)) + return result; + + VkBufferMemoryRequirementsInfo2 mem_req_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, -+ .buffer = bc1->bc1_table_buf, ++ .buffer = *vk_buf, + }; + VkMemoryRequirements2 mem_req = { + .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, @@ -1345,8 +5141,7 @@ index 00000000000..2c85d98af5c + disp->GetBufferMemoryRequirements2(_device, &mem_req_info, &mem_req); + + uint32_t mem_type_index = get_mem_type_index( -+ device, mem_req.memoryRequirements.memoryTypeBits, -+ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); ++ device, mem_req.memoryRequirements.memoryTypeBits, mem_prop_flags); + if (mem_type_index == -1) + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + @@ -1355,128 +5150,213 @@ index 00000000000..2c85d98af5c + .allocationSize = mem_req.memoryRequirements.size, + .memoryTypeIndex = mem_type_index, + }; -+ result = disp->AllocateMemory(_device, &alloc_info, allocator, &bc1->bc1_table_mem); ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, vk_mem); + if (unlikely(result != VK_SUCCESS)) -+ return result; -+ -+ result = disp->BindBufferMemory(_device, bc1->bc1_table_buf, bc1->bc1_table_mem, 0); -+ if (unlikely(result != VK_SUCCESS)) -+ return result; -+ -+ void *data; -+ disp->MapMemory(_device, bc1->bc1_table_mem, 0, size, 0, &data); -+ float 
(*data_array)[2] = (float(*)[2])data; -+ for (int i = 0; i < 256; i++) { -+ for (int j = 0; j < 2; j++) { -+ data_array[i][j] = (float)stb__OMatch5[i][j]; -+ data_array[i + 256][j] = (float)stb__OMatch6[i][j]; -+ } -+ } -+ disp->UnmapMemory(_device, bc1->bc1_table_mem); ++ return result; ++ ++ disp->BindBufferMemory(_device, *vk_buf, *vk_mem, 0); + + return result; +} + -+VkResult -+vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, -+ VkImage *image, VkExtent3D extent, VkFormat format, int size) ++static VkResult ++create_buffer_view(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkBufferView *buf_view, VkBuffer buf, VkFormat format, VkDeviceSize size, ++ VkDeviceSize offset) +{ -+ bool is_dummy = (size != 0); + VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + -+ VkImageCreateInfo image_create_info = { -+ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, -+ .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, -+ .imageType = VK_IMAGE_TYPE_2D, ++ VkBufferViewCreateInfo buffer_view_create_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, ++ .buffer = buf, + .format = format, -+ .extent = extent, -+ .mipLevels = 1, -+ .arrayLayers = 1, -+ .samples = VK_SAMPLE_COUNT_1_BIT, -+ .tiling = VK_IMAGE_TILING_OPTIMAL, -+ .usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT, -+ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, -+ .initialLayout = VK_IMAGE_LAYOUT_GENERAL, ++ .offset = offset, ++ .range = size, + }; -+ result = disp->CreateImage(_device, &image_create_info, allocator, image); -+ if (unlikely(result != VK_SUCCESS)) { -+ return result; ++ result = disp->CreateBufferView(_device, &buffer_view_create_info, ++ allocator, buf_view); ++ return result; ++} ++ ++static uint8_t ++get_partition_table_index(VkFormat format) ++{ ++ switch (format) { ++ case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: ++ return 0; ++ case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: ++ return 1; ++ case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: ++ return 2; ++ case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: ++ return 3; ++ case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: ++ return 4; ++ case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: ++ return 5; ++ case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: ++ return 6; ++ case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: ++ return 7; ++ case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: ++ return 8; ++ case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: ++ return 9; ++ case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: ++ return 10; ++ case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: ++ return 11; ++ case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: ++ return 12; ++ case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: ++ return 13; ++ default: ++ unreachable("bad astc format\n"); ++ return 0; + } ++} ++ ++static VkResult ++astc_prepare_buffer(struct vk_device *device, ++ struct vk_texcompress_astc_state *astc, ++ VkAllocationCallbacks *allocator, ++ VkDeviceSize minTexelBufferOffsetAlignment, 
++ uint8_t *single_buf_ptr, ++ VkDeviceSize *single_buf_size) ++{ ++ VkResult result; ++ astc_decoder_lut_holder astc_lut_holder; ++ VkDeviceSize offset = 0; + -+ if (is_dummy) { -+ VkMemoryRequirements mem_requirements; -+ disp->GetImageMemoryRequirements(_device, *image, &mem_requirements); ++ _mesa_init_astc_decoder_luts(&astc_lut_holder); + -+ uint32_t mem_type_index = get_mem_type_index( -+ device, mem_requirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); -+ if (mem_type_index == -1) { -+ return VK_ERROR_OUT_OF_HOST_MEMORY; -+ } ++ const astc_decoder_lut *luts[] = { ++ &astc_lut_holder.color_endpoint, ++ &astc_lut_holder.color_endpoint_unquant, ++ &astc_lut_holder.weights, ++ &astc_lut_holder.weights_unquant, ++ &astc_lut_holder.trits_quints, ++ }; + -+ VkMemoryAllocateInfo alloc_info = { -+ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, -+ .allocationSize = size, -+ .memoryTypeIndex = mem_type_index, -+ }; -+ result = disp->AllocateMemory(_device, &alloc_info, allocator, pMem); -+ if (unlikely(result != VK_SUCCESS)) { -+ return result; ++ for (unsigned i = 0; i < ARRAY_SIZE(luts); i++) { ++ offset = align(offset, minTexelBufferOffsetAlignment); ++ if (single_buf_ptr) { ++ memcpy(single_buf_ptr + offset, luts[i]->data, luts[i]->size_B); ++ result = create_buffer_view(device, allocator, &astc->luts_buf_view[i], astc->luts_buf, ++ vk_format_from_pipe_format(luts[i]->format), luts[i]->size_B, ++ offset); ++ if (result != VK_SUCCESS) ++ return result; + } ++ offset += luts[i]->size_B; + } -+ if (is_dummy) { -+ disp->DestroyImage(_device, *image, allocator); -+ } -+ return result; -+} + -+void -+vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_image *intermidate_image) -+{ -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_rgba, NULL); -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc1, NULL); -+ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc4, NULL); -+ disp->FreeMemory(vk_device_to_handle(device), intermidate_image->image_mem, NULL); -+ vk_free(&device->alloc, intermidate_image); -+} ++ const VkFormat formats[] = { ++ VK_FORMAT_ASTC_4x4_UNORM_BLOCK, ++ VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ++ VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ++ VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ++ VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ++ VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ++ }; + -+void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ disp->BindImageMemory(_device, *image, *pMem, offset); ++ for (uint32_t i = 0; i < ARRAY_SIZE(formats); i++) { ++ unsigned lut_width; ++ unsigned lut_height; ++ const void *lut_data = _mesa_get_astc_decoder_partition_table( ++ vk_format_get_blockwidth(formats[i]), ++ vk_format_get_blockheight(formats[i]), ++ &lut_width, &lut_height); ++ const unsigned lut_size = lut_width * lut_height; ++ ++ offset = align(offset, minTexelBufferOffsetAlignment); ++ if (single_buf_ptr) { ++ memcpy(single_buf_ptr + offset, lut_data, 
lut_width * lut_height); ++ ++ result = create_buffer_view(device, allocator, &astc->partition_tbl_buf_view[i], ++ astc->luts_buf, VK_FORMAT_R8_UINT, lut_width * lut_height, ++ offset); ++ if (result != VK_SUCCESS) ++ return result; ++ } ++ offset += lut_size; ++ } ++ ++ *single_buf_size = offset; ++ return result; +} + +static VkResult -+create_sampler(struct vk_device *device, VkAllocationCallbacks *allocator, -+ VkSampler *sampler, VkFilter filter, VkSamplerMipmapMode mipmapMode) ++create_fill_all_luts_vulkan(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ ++ VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical); ++ const struct vk_physical_device_dispatch_table *phy_disp = &device->physical->dispatch_table; ++ VkDeviceSize single_buf_size; ++ uint8_t *single_buf_ptr; + -+ VkSamplerCreateInfo create_info = { -+ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, -+ .magFilter = filter, -+ .minFilter = filter, -+ .addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT, -+ .anisotropyEnable = VK_FALSE, -+ .borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK, -+ .unnormalizedCoordinates = VK_FALSE, -+ .compareEnable = VK_FALSE, -+ .mipmapMode = mipmapMode, ++ VkPhysicalDeviceProperties2 phy_dev_prop = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, ++ .pNext = NULL, + }; -+ VkResult result = disp->CreateSampler(_device, &create_info, allocator, sampler); ++ phy_disp->GetPhysicalDeviceProperties2(_phy_device, &phy_dev_prop); ++ ++ /* get the single_buf_size */ ++ result = astc_prepare_buffer(device, astc, allocator, ++ phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment, ++ NULL, &single_buf_size); ++ ++ /* create gpu buffer for all the luts */ ++ result = vk_create_buffer(device, allocator, single_buf_size, ++ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | ++ VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, ++ VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT, ++ &astc->luts_buf, &astc->luts_mem); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ disp->MapMemory(_device, astc->luts_mem, 0, VK_WHOLE_SIZE, 0, (void*)&single_buf_ptr); ++ ++ /* fill all the luts and create views */ ++ result = astc_prepare_buffer(device, astc, allocator, ++ phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment, ++ single_buf_ptr, &single_buf_size); ++ ++ disp->UnmapMemory(_device, astc->luts_mem); + return result; +} + +static VkResult -+create_bc1_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++create_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ + VkResult result; + VkDevice _device = vk_device_to_handle(device); @@ -1484,101 +5364,57 @@ index 00000000000..2c85d98af5c + + VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 0, /* OutputImage2DArray */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ++ .binding = 1, /* PayloadInput2DArray */ ++ .descriptorType = 
VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 2, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 2, /* LUTRemainingBitsToEndpointQuantizer */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, -+ }; -+ -+ VkDescriptorSetLayoutCreateInfo ds_create_info = { -+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, -+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, -+ .bindingCount = ARRAY_SIZE(bindings), -+ .pBindings = bindings, -+ }; -+ -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return result; -+} -+ -+static VkResult -+create_bc4_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ -+ VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 3, /* LUTEndpointUnquantize */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 4, /* LUTWeightQuantizer */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, -+ }; -+ -+ VkDescriptorSetLayoutCreateInfo ds_create_info = { -+ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, -+ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, -+ .bindingCount = ARRAY_SIZE(bindings), -+ .pBindings = bindings, -+ }; -+ -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return result; -+} -+ -+static VkResult -+create_bc3_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ -+ VkDescriptorSetLayoutBinding bindings[] = { + { -+ .binding = 0, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 5, /* LUTWeightUnquantize */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 1, -+ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .binding = 6, /* LUTTritQuintDecode */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, + }, + { -+ .binding = 2, -+ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .binding = 7, /* LUTPartitionTable */ ++ .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER, + .descriptorCount = 1, + .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, + .pImmutableSamplers = NULL, @@ -1592,61 +5428,32 @@ index 00000000000..2c85d98af5c + .pBindings = bindings, + }; + -+ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); -+ return 
result; -+} -+ -+static VkResult -+create_pipeline_layout(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, enum compute_pipeline_id id) -+{ -+ VkResult result = VK_RESULT_MAX_ENUM; -+ switch (id) { -+ case BC1_ENCODE: -+ result = create_bc1_layout(device, allocator, bcn); -+ break; -+ case BC4_ENCODE: -+ result = create_bc4_layout(device, allocator, bcn); -+ break; -+ case BC3_ENCODE: -+ result = create_bc3_layout(device, allocator, bcn); -+ break; -+ default: -+ goto fail; -+ } -+ if (result != VK_SUCCESS) { ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, ++ allocator, &astc->ds_layout); ++ if (result != VK_SUCCESS) + goto fail; -+ } -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ + VkPipelineLayoutCreateInfo pl_create_info = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, -+ .pSetLayouts = &bcn->ds_layout, ++ .pSetLayouts = &astc->ds_layout, + .pushConstantRangeCount = 1, -+ .pPushConstantRanges = &(VkPushConstantRange) {VK_SHADER_STAGE_COMPUTE_BIT, 0, 8} ++ .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 20}, + }; -+ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, &bcn->p_layout); ++ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, ++ &astc->p_layout); +fail: + return result; +} + -+static const uint32_t bc1_spv[] = { -+#include "bc1_spv.h" -+}; -+ -+static const uint32_t bc4_spv[] = { -+#include "bc4_spv.h" -+}; -+ -+static const uint32_t bc3_spv[] = { -+#include "bc3_spv.h" ++static const uint32_t astc_spv[] = { ++#include "astc_spv.h" +}; + +static VkResult -+vk_bc1_create_shader_module(struct vk_device *device, ++vk_astc_create_shader_module(struct vk_device *device, + VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++ struct vk_texcompress_astc_state *astc) +{ + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; @@ -1655,131 +5462,114 @@ index 00000000000..2c85d98af5c + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + .pNext = NULL, + .flags = 0, -+ .codeSize = sizeof(bc1_spv), -+ .pCode = bc1_spv, ++ .codeSize = sizeof(astc_spv), ++ .pCode = astc_spv, + }; + -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++ return disp->CreateShaderModule(_device, &shader_module_create_info, ++ allocator, &astc->shader_module); +} + +static VkResult -+vk_bc4_create_shader_module(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn) ++create_astc_decode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, ++ VkPipelineCache pipeline_cache, VkFormat format) +{ ++ VkResult result; + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipeline pipeline; ++ uint8_t t_i; + -+ VkShaderModuleCreateInfo shader_module_create_info = { -+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, -+ .pNext = NULL, -+ .flags = 0, -+ .codeSize = sizeof(bc4_spv), -+ .pCode = bc4_spv, -+ }; -+ -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); -+} -+ -+static VkResult -+vk_bc3_create_shader_module(struct vk_device *device, -+ VkAllocationCallbacks *allocator, 
-+ struct vk_texcompress_bcn_state *bcn) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ t_i = get_partition_table_index(format); + -+ VkShaderModuleCreateInfo shader_module_create_info = { -+ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, -+ .pNext = NULL, -+ .flags = 0, -+ .codeSize = sizeof(bc3_spv), -+ .pCode = bc3_spv, ++ uint32_t special_data[3] = { ++ vk_format_get_blockwidth(format), ++ vk_format_get_blockheight(format), ++ true, ++ }; ++ VkSpecializationMapEntry special_map_entry[3] = {{ ++ .constantID = 0, ++ .offset = 0, ++ .size = 4, ++ }, ++ { ++ .constantID = 1, ++ .offset = 4, ++ .size = 4, ++ }, ++ { ++ .constantID = 2, ++ .offset = 8, ++ .size = 4, ++ }}; ++ ++ VkSpecializationInfo specialization_info = { ++ .mapEntryCount = 3, ++ .pMapEntries = special_map_entry, ++ .dataSize = 12, ++ .pData = special_data, + }; -+ -+ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); -+} -+ -+static VkResult -+create_bcn_encode_pipeline(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, -+ VkPipelineCache pipeline_cache) -+{ -+ VkResult result; -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; -+ VkPipeline pipeline; + + /* compute shader */ + VkPipelineShaderStageCreateInfo pipeline_shader_stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, -+ .module = bcn->shader_module, ++ .module = astc->shader_module, + .pName = "main", ++ .pSpecializationInfo = &specialization_info, + }; + + VkComputePipelineCreateInfo vk_pipeline_info = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = pipeline_shader_stage, + .flags = 0, -+ .layout = bcn->p_layout, ++ .layout = astc->p_layout, + }; + -+ result = disp->CreateComputePipelines(_device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); ++ result = disp->CreateComputePipelines( ++ _device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); + if (result != VK_SUCCESS) + return result; + -+ bcn->pipeline = pipeline; ++ astc->pipeline[t_i] = pipeline; ++ astc->pipeline_mask |= (1 << t_i); + + return result; +} + +VkPipeline -+vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, VkPipelineCache pipeline_cache, -+ enum compute_pipeline_id id) ++vk_texcompress_astc_get_decode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, VkPipelineCache pipeline_cache, ++ VkFormat format) +{ + VkResult result; ++ uint8_t t_i = get_partition_table_index(format); + -+ simple_mtx_lock(&bcn->mutex); -+ -+ if (bcn->pipeline) -+ goto unlock; ++ simple_mtx_lock(&astc->mutex); + -+ if (id >= MAX_PIPELINE_NUM || id < 0) ++ if (astc->pipeline[t_i]) + goto unlock; + -+ if (id == BC1_ENCODE && !bcn->shader_module) { -+ result = vk_bc1_create_shader_module(device, allocator, bcn); -+ if (result != VK_SUCCESS) -+ goto unlock; -+ } -+ -+ if (id == BC4_ENCODE && !bcn->shader_module) { -+ result = vk_bc4_create_shader_module(device, allocator, bcn); -+ if (result != VK_SUCCESS) -+ goto unlock; -+ } -+ -+ if (id == BC3_ENCODE && !bcn->shader_module) { -+ result = vk_bc3_create_shader_module(device, allocator, bcn); ++ if (!astc->shader_module) { ++ result = 
vk_astc_create_shader_module(device, allocator, astc); + if (result != VK_SUCCESS) + goto unlock; + } + -+ create_bcn_encode_pipeline(device, allocator, bcn, pipeline_cache); ++ create_astc_decode_pipeline(device, allocator, astc, pipeline_cache, format); + +unlock: -+ simple_mtx_unlock(&bcn->mutex); -+ return bcn->pipeline; ++ simple_mtx_unlock(&astc->mutex); ++ return astc->pipeline[t_i]; +} + +static inline void -+fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkSampler sampler, VkImageView img_view, ++fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkImageView img_view, + VkImageLayout img_layout) +{ -+ info->sampler = sampler; ++ info->sampler = VK_NULL_HANDLE; + info->imageView = img_view; + info->imageLayout = img_layout; +} @@ -1801,9 +5591,9 @@ index 00000000000..2c85d98af5c +} + +static inline void -+fill_write_descriptor_set_buffer(VkWriteDescriptorSet *set, -+ uint8_t bind_i, VkDescriptorType desc_type, -+ VkDescriptorBufferInfo *buf_info) ++fill_write_descriptor_set_uniform_texel(VkWriteDescriptorSet *set, ++ uint8_t bind_i, ++ VkBufferView *buf_view) +{ + set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + set->pNext = NULL; @@ -1811,177 +5601,102 @@ index 00000000000..2c85d98af5c + set->dstBinding = bind_i; + set->dstArrayElement = 0; + set->descriptorCount = 1; -+ set->descriptorType = desc_type; ++ set->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER; + set->pImageInfo = NULL; -+ set->pBufferInfo = buf_info; -+ set->pTexelBufferView = NULL; ++ set->pBufferInfo = NULL; ++ set->pTexelBufferView = buf_view; +} + +void -+vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, ++vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc, ++ struct vk_texcompress_astc_write_descriptor_set *set, + VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view) -+{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ set->bc1_buffer_info.buffer = bcn->bc1_table_buf; -+ set->bc1_buffer_info.offset = 0; -+ set->bc1_buffer_info.range = VK_WHOLE_SIZE; -+ fill_write_descriptor_set_buffer(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &set->bc1_buffer_info); -+ -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, -+ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); -+} -+ -+void -+vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bc4_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view) -+{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); -+} -+ -+void -+vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ 
struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, -+ VkImageView dst_img_view) ++ VkImageView dst_img_view, ++ VkFormat format) +{ -+ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, bc1_src_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); -+ -+ fill_desc_image_info_struct(&set->src_desc_image_info_2, bcn->sampler, bc4_src_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, -+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info_2); ++ unsigned desc_i; + -+ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); -+ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ desc_i = 0; ++ fill_desc_image_info_struct(&set->dst_desc_image_info, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++ desc_i++; ++ fill_desc_image_info_struct(&set->src_desc_image_info, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i, ++ VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, &set->src_desc_image_info); ++ /* fill luts descriptor */ ++ desc_i++; ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++) { ++ fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i + i], desc_i + i, ++ &astc->luts_buf_view[i]); ++ } ++ desc_i += VK_TEXCOMPRESS_ASTC_NUM_LUTS; ++ uint8_t t_i = get_partition_table_index(format); ++ fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i], desc_i, ++ &astc->partition_tbl_buf_view[t_i]); ++ desc_i++; ++ assert(desc_i == ARRAY_SIZE(set->descriptor_set)); +} + +VkResult -+vk_texcompress_bcn_init(struct vk_device *device, VkAllocationCallbacks *allocator, -+ VkPipelineCache pipeline_cache, -+ struct vk_texcompress_bcn_state *bcn[]) ++vk_texcompress_astc_init(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_astc_state **astc) +{ + VkResult result; + -+ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { -+ /* memory to be freed as part of vk_bcn_decode_finish() */ -+ bcn[pi] = vk_zalloc(allocator, sizeof(struct vk_texcompress_bcn_state), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); -+ if (bcn[pi] == NULL) { -+ return VK_ERROR_OUT_OF_HOST_MEMORY; -+ } -+ simple_mtx_init(&(bcn[pi])->mutex, mtx_plain); -+ } -+ -+ result = create_buffer(device, allocator, bcn[BC1_ENCODE]); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_sampler(device, allocator, &bcn[BC1_ENCODE]->sampler, -+ VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_pipeline_layout(device, allocator, bcn[BC1_ENCODE], BC1_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; -+ -+ result = create_sampler(device, allocator, &bcn[BC4_ENCODE]->sampler, -+ VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR); -+ if (result != VK_SUCCESS) -+ goto fail; ++ /* astc memory to be freed as part of vk_astc_decode_finish() */ ++ *astc = vk_zalloc(allocator, sizeof(struct vk_texcompress_astc_state), 8, ++ VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (*astc == NULL) ++ return VK_ERROR_OUT_OF_HOST_MEMORY; + -+ result = create_pipeline_layout(device, allocator, 
bcn[BC4_ENCODE], BC4_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; ++ simple_mtx_init(&(*astc)->mutex, mtx_plain); + -+ // 由于VK_FORMAT_R32G32_UINT格式的imageView格式不包含VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, 采用nearest -+ result = create_sampler(device, allocator, &bcn[BC3_ENCODE]->sampler, -+ VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST); ++ result = create_fill_all_luts_vulkan(device, allocator, *astc); + if (result != VK_SUCCESS) + goto fail; + -+ result = create_pipeline_layout(device, allocator, bcn[BC3_ENCODE], BC3_ENCODE); -+ if (result != VK_SUCCESS) -+ goto fail; ++ result = create_layout(device, allocator, *astc); + +fail: + return result; +} + +void -+vk_texcompress_bcn_finish(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn[]) ++vk_texcompress_astc_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc) +{ + VkDevice _device = vk_device_to_handle(device); + const struct vk_device_dispatch_table *disp = &device->dispatch_table; + -+ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { -+ if (bcn[pi]->pipeline) -+ disp->DestroyPipeline(_device, bcn[pi]->pipeline, allocator); -+ -+ disp->DestroyPipelineLayout(_device, bcn[pi]->p_layout, allocator); -+ disp->DestroyShaderModule(_device, bcn[pi]->shader_module, allocator); -+ disp->DestroyDescriptorSetLayout(_device, bcn[pi]->ds_layout, allocator); -+ -+ if (pi == BC1_ENCODE) { -+ disp->DestroyBuffer(_device, bcn[pi]->bc1_table_buf, allocator); -+ disp->FreeMemory(_device, bcn[pi]->bc1_table_mem, allocator); -+ } -+ vk_free(allocator, bcn[pi]); ++ while (astc->pipeline_mask) { ++ uint8_t t_i = u_bit_scan(&astc->pipeline_mask); ++ disp->DestroyPipeline(_device, astc->pipeline[t_i], allocator); + } -+} + -+void -+vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->DestroyPipelineLayout(_device, astc->p_layout, allocator); ++ disp->DestroyShaderModule(_device, astc->shader_module, allocator); ++ disp->DestroyDescriptorSetLayout(_device, astc->ds_layout, allocator); + -+ VkFenceCreateInfo create_info = { -+ .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, -+ .flags = VK_FENCE_CREATE_SIGNALED_BIT, -+ }; -+ VkResult result = disp->CreateFence(_device, &create_info, allocator, fence); -+ if (result != VK_SUCCESS) { -+ ALOGE("Failed to create fence"); -+ } -+} ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++) ++ disp->DestroyBufferView(_device, astc->luts_buf_view[i], allocator); + -+void -+vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence) -+{ -+ VkDevice _device = vk_device_to_handle(device); -+ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES; i++) ++ disp->DestroyBufferView(_device, astc->partition_tbl_buf_view[i], allocator); + -+ VkResult result = disp->WaitForFences(_device, 1, fence, VK_TRUE, UINT64_MAX); -+ if (result != VK_SUCCESS) { -+ ALOGE("Failed to wait for fence"); -+ } ++ disp->DestroyBuffer(_device, astc->luts_buf, allocator); ++ disp->FreeMemory(_device, astc->luts_mem, allocator); ++ ++ vk_free(allocator, astc); +} -\ No newline at end of file -diff --git a/src/vulkan/runtime/vk_texcompress_bcn.h b/src/vulkan/runtime/vk_texcompress_bcn.h 
+diff --git a/src/vulkan/runtime/vk_texcompress_astc.h b/src/vulkan/runtime/vk_texcompress_astc.h new file mode 100644 -index 00000000000..ca8189b657e +index 00000000000..417ab434be2 --- /dev/null -+++ b/src/vulkan/runtime/vk_texcompress_bcn.h -@@ -0,0 +1,142 @@ ++++ b/src/vulkan/runtime/vk_texcompress_astc.h +@@ -0,0 +1,122 @@ +/* Copyright (c) 2017-2023 Hans-Kristian Arntzen + * + * Permission is hereby granted, free of charge, to any person obtaining @@ -2003,124 +5718,337 @@ index 00000000000..ca8189b657e + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ -+#ifndef VK_TEXCOMPRESS_BCN_H -+#define VK_TEXCOMPRESS_BCN_H ++#ifndef VK_TEXCOMPRESS_ASTC_H ++#define VK_TEXCOMPRESS_ASTC_H + +#include "util/simple_mtx.h" -+#include "vk_fence.h" +#include "vk_device.h" + -+#define VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES 3 -+#define VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT 3 // 每个着色器最多有3个描述符集 -+ -+struct vk_texcompress_bcn_write_descriptor_set { -+ VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT]; -+ VkDescriptorImageInfo dst_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info_2; -+ VkDescriptorBufferInfo bc1_buffer_info; -+}; -+ -+struct vk_texcompress_bc4_write_descriptor_set { -+ VkWriteDescriptorSet descriptor_set[2]; -+ VkDescriptorImageInfo dst_desc_image_info; -+ VkDescriptorImageInfo src_desc_image_info; -+}; -+ -+typedef struct vk_texcompress_bcn_image { -+ VkImage image_dummy; -+ VkImage image_rgba; -+ VkImage image_bc1; -+ VkImage image_bc4; -+ VkDeviceMemory image_mem; -+} bcn_image; -+ -+struct vk_texcompress_dummy_image { -+ VkImage image_dummy; -+ VkDeviceMemory image_mem; -+}; ++/* luts order matching astc glsl shader below, ++ * 0 - color endpoint ++ * 1 - color endpoint unquant ++ * 2 - weights ++ * 3 - weights unquant ++ * 4 - trits quints ++ */ ++#define VK_TEXCOMPRESS_ASTC_NUM_LUTS 5 ++#define VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES 14 ++#define VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT 8 + -+typedef struct vk_texcompress_staging_images { -+ struct vk_texcompress_bcn_image *current; -+ struct vk_texcompress_staging_images* next; -+} staging_images; ++struct vk_texcompress_astc_state { ++ /* single buffer is allocated for all luts */ ++ VkDeviceMemory luts_mem; ++ VkBuffer luts_buf; + -+enum compute_pipeline_id { -+ BC1_ENCODE = 0, -+ BC4_ENCODE, -+ BC3_ENCODE, -+ MAX_PIPELINE_NUM -+}; ++ VkBufferView luts_buf_view[VK_TEXCOMPRESS_ASTC_NUM_LUTS]; ++ VkBufferView partition_tbl_buf_view[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES]; + -+struct vk_texcompress_bcn_state { + simple_mtx_t mutex; + VkDescriptorSetLayout ds_layout; + VkPipelineLayout p_layout; -+ VkPipeline pipeline; ++ VkPipeline pipeline[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES]; ++ uint32_t pipeline_mask; + VkShaderModule shader_module; -+ VkSampler sampler; -+ -+ VkDeviceMemory bc1_table_mem; -+ VkBuffer bc1_table_buf; +}; + -+void -+vk_texcompress_insert_head(staging_images **head, bcn_image *current); -+ -+void -+vk_texcompress_delete_head(staging_images **head, struct vk_device *device); -+ -+VkResult -+vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, -+ VkImage *image, VkExtent3D extent, VkFormat format, int size); -+ -+void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset); -+ -+void -+vk_texcompress_finish_image(struct vk_device *device, const 
VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_image *intermidate_image); -+ -+void -+vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view); -+ -+void -+vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bc4_write_descriptor_set *set, -+ VkImageView src_img_view, VkImageLayout src_img_layout, -+ VkImageView dst_img_view); ++struct vk_texcompress_astc_write_descriptor_set { ++ VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT]; ++ VkDescriptorImageInfo dst_desc_image_info; ++ VkDescriptorImageInfo src_desc_image_info; ++}; + +void -+vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, -+ struct vk_texcompress_bcn_write_descriptor_set *set, -+ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, -+ VkImageView dst_img_view); -+ -+VkPipeline -+vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn, -+ VkPipelineCache pipeline_cache, -+ enum compute_pipeline_id id); ++vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc, ++ struct vk_texcompress_astc_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view, ++ VkFormat format); ++VkPipeline vk_texcompress_astc_get_decode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc, ++ VkPipelineCache pipeline_cache, ++ VkFormat format); ++VkResult vk_texcompress_astc_init(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_astc_state **astc); ++void vk_texcompress_astc_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_astc_state *astc); ++ ++static inline VkFormat ++vk_texcompress_astc_emulation_format(VkFormat format) ++{ ++ /* TODO: From VK_EXT_astc_Decode_mode spec, VK_FORMAT_R16G16B16A16_SFLOAT is the default ++ * option. VK_FORMAT_R8G8B8A8_UNORM is only acceptable image quality option. 
++ */ ++ switch (format) { ++ case VK_FORMAT_ASTC_4x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x4_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_5x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_6x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_8x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x5_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x6_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x8_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_10x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x10_UNORM_BLOCK: ++ case VK_FORMAT_ASTC_12x12_UNORM_BLOCK: ++ return VK_FORMAT_R8G8B8A8_UNORM; ++ case VK_FORMAT_ASTC_4x4_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_5x4_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_5x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_6x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_6x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_8x8_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x5_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x6_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x8_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_10x10_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_12x10_SRGB_BLOCK: ++ case VK_FORMAT_ASTC_12x12_SRGB_BLOCK: ++ return VK_FORMAT_R8G8B8A8_SRGB; ++ default: ++ return VK_FORMAT_UNDEFINED; ++ } ++} + -+VkResult -+vk_texcompress_bcn_init(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ VkPipelineCache pipeline_cache, -+ struct vk_texcompress_bcn_state *bcn[]); ++#endif /* VK_TEXCOMPRESS_ASTC_H */ +diff --git a/src/vulkan/util/vk_format.c b/src/vulkan/util/vk_format.c +index f5da22bac88..57984de2227 100644 +--- a/src/vulkan/util/vk_format.c ++++ b/src/vulkan/util/vk_format.c +@@ -288,6 +288,201 @@ vk_format_to_pipe_format(enum VkFormat vkformat) + return vk_format_map[vkformat]; + } + + -+void -+vk_texcompress_bcn_finish(struct vk_device *device, -+ VkAllocationCallbacks *allocator, -+ struct vk_texcompress_bcn_state *bcn[]); ++static const VkFormat formats[PIPE_FORMAT_COUNT] = { ++#define MAP_FORMAT_NORM(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _UNORM] = VK_FORMAT_ ## FMT ## _UNORM, \ ++ [PIPE_FORMAT_ ## FMT ## _SNORM] = VK_FORMAT_ ## FMT ## _SNORM, ++ ++#define MAP_FORMAT_SCALED(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _USCALED] = VK_FORMAT_ ## FMT ## _USCALED, \ ++ [PIPE_FORMAT_ ## FMT ## _SSCALED] = VK_FORMAT_ ## FMT ## _SSCALED, ++ ++#define MAP_FORMAT_INT(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _UINT] = VK_FORMAT_ ## FMT ## _UINT, \ ++ [PIPE_FORMAT_ ## FMT ## _SINT] = VK_FORMAT_ ## FMT ## _SINT, ++ ++#define MAP_FORMAT_SRGB(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _SRGB] = VK_FORMAT_ ## FMT ## _SRGB, ++ ++#define MAP_FORMAT_FLOAT(FMT) \ ++ [PIPE_FORMAT_ ## FMT ## _FLOAT] = VK_FORMAT_ ## FMT ## _SFLOAT, ++ ++ // one component ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8) ++ MAP_FORMAT_SCALED(R8) ++ MAP_FORMAT_INT(R8) ++ MAP_FORMAT_SRGB(R8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16) ++ MAP_FORMAT_SCALED(R16) ++ MAP_FORMAT_INT(R16) ++ MAP_FORMAT_FLOAT(R16) ++ // 32-bits ++ MAP_FORMAT_INT(R32) ++ MAP_FORMAT_FLOAT(R32) ++ ++ // two components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8) ++ MAP_FORMAT_SCALED(R8G8) ++ MAP_FORMAT_INT(R8G8) ++ MAP_FORMAT_SRGB(R8G8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16) ++ MAP_FORMAT_SCALED(R16G16) ++ MAP_FORMAT_INT(R16G16) ++ MAP_FORMAT_FLOAT(R16G16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32) ++ MAP_FORMAT_FLOAT(R32G32) ++ ++ // three components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8B8) ++ MAP_FORMAT_SCALED(R8G8B8) ++ MAP_FORMAT_INT(R8G8B8) ++ MAP_FORMAT_SRGB(R8G8B8) ++ MAP_FORMAT_NORM(B8G8R8) ++ MAP_FORMAT_SCALED(B8G8R8) ++ 
MAP_FORMAT_INT(B8G8R8) ++ MAP_FORMAT_SRGB(B8G8R8) ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16B16) ++ MAP_FORMAT_SCALED(R16G16B16) ++ MAP_FORMAT_INT(R16G16B16) ++ MAP_FORMAT_FLOAT(R16G16B16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32B32) ++ MAP_FORMAT_FLOAT(R32G32B32) ++ ++ // four components ++ ++ // 8-bits ++ MAP_FORMAT_NORM(R8G8B8A8) ++ MAP_FORMAT_SCALED(R8G8B8A8) ++ MAP_FORMAT_INT(R8G8B8A8) ++ MAP_FORMAT_NORM(B8G8R8A8) ++ MAP_FORMAT_SCALED(B8G8R8A8) ++ MAP_FORMAT_INT(B8G8R8A8) ++ MAP_FORMAT_SRGB(B8G8R8A8) ++ [PIPE_FORMAT_RGBA8888_SRGB] = VK_FORMAT_A8B8G8R8_SRGB_PACK32, ++ // 16-bits ++ MAP_FORMAT_NORM(R16G16B16A16) ++ MAP_FORMAT_SCALED(R16G16B16A16) ++ MAP_FORMAT_INT(R16G16B16A16) ++ MAP_FORMAT_FLOAT(R16G16B16A16) ++ // 32-bits ++ MAP_FORMAT_INT(R32G32B32A32) ++ MAP_FORMAT_FLOAT(R32G32B32A32) ++ ++ // other color formats ++ [PIPE_FORMAT_A4B4G4R4_UNORM] = VK_FORMAT_R4G4B4A4_UNORM_PACK16, ++ [PIPE_FORMAT_A4R4G4B4_UNORM] = VK_FORMAT_B4G4R4A4_UNORM_PACK16, ++ [PIPE_FORMAT_B4G4R4A4_UNORM] = VK_FORMAT_A4R4G4B4_UNORM_PACK16, ++ [PIPE_FORMAT_R4G4B4A4_UNORM] = VK_FORMAT_A4B4G4R4_UNORM_PACK16, ++ [PIPE_FORMAT_B5G6R5_UNORM] = VK_FORMAT_R5G6B5_UNORM_PACK16, ++ [PIPE_FORMAT_R5G6B5_UNORM] = VK_FORMAT_B5G6R5_UNORM_PACK16, ++ ++ [PIPE_FORMAT_A1B5G5R5_UNORM] = VK_FORMAT_R5G5B5A1_UNORM_PACK16, ++ [PIPE_FORMAT_A1R5G5B5_UNORM] = VK_FORMAT_B5G5R5A1_UNORM_PACK16, ++ [PIPE_FORMAT_B5G5R5A1_UNORM] = VK_FORMAT_A1R5G5B5_UNORM_PACK16, ++ ++ [PIPE_FORMAT_R11G11B10_FLOAT] = VK_FORMAT_B10G11R11_UFLOAT_PACK32, ++ [PIPE_FORMAT_R9G9B9E5_FLOAT] = VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, ++ /* ARB_vertex_type_2_10_10_10 */ ++ [PIPE_FORMAT_R10G10B10A2_UNORM] = VK_FORMAT_A2B10G10R10_UNORM_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_SNORM] = VK_FORMAT_A2B10G10R10_SNORM_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_UNORM] = VK_FORMAT_A2R10G10B10_UNORM_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SNORM] = VK_FORMAT_A2R10G10B10_SNORM_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_USCALED] = VK_FORMAT_A2B10G10R10_USCALED_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_SSCALED] = VK_FORMAT_A2B10G10R10_SSCALED_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_USCALED] = VK_FORMAT_A2R10G10B10_USCALED_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SSCALED] = VK_FORMAT_A2R10G10B10_SSCALED_PACK32, ++ [PIPE_FORMAT_R10G10B10A2_UINT] = VK_FORMAT_A2B10G10R10_UINT_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_UINT] = VK_FORMAT_A2R10G10B10_UINT_PACK32, ++ [PIPE_FORMAT_B10G10R10A2_SINT] = VK_FORMAT_A2R10G10B10_SINT_PACK32, ++ ++ // depth/stencil formats ++ [PIPE_FORMAT_Z32_FLOAT] = VK_FORMAT_D32_SFLOAT, ++ [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT] = VK_FORMAT_D32_SFLOAT_S8_UINT, ++ [PIPE_FORMAT_Z16_UNORM] = VK_FORMAT_D16_UNORM, ++ [PIPE_FORMAT_Z16_UNORM_S8_UINT] = VK_FORMAT_D16_UNORM_S8_UINT, ++ [PIPE_FORMAT_Z24X8_UNORM] = VK_FORMAT_X8_D24_UNORM_PACK32, ++ [PIPE_FORMAT_Z24_UNORM_S8_UINT] = VK_FORMAT_D24_UNORM_S8_UINT, ++ [PIPE_FORMAT_S8_UINT] = VK_FORMAT_S8_UINT, ++ ++ // compressed formats ++ [PIPE_FORMAT_DXT1_RGB] = VK_FORMAT_BC1_RGB_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT1_RGBA] = VK_FORMAT_BC1_RGBA_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT3_RGBA] = VK_FORMAT_BC2_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT5_RGBA] = VK_FORMAT_BC3_UNORM_BLOCK, ++ [PIPE_FORMAT_DXT1_SRGB] = VK_FORMAT_BC1_RGB_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT1_SRGBA] = VK_FORMAT_BC1_RGBA_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT3_SRGBA] = VK_FORMAT_BC2_SRGB_BLOCK, ++ [PIPE_FORMAT_DXT5_SRGBA] = VK_FORMAT_BC3_SRGB_BLOCK, ++ ++ [PIPE_FORMAT_RGTC1_UNORM] = VK_FORMAT_BC4_UNORM_BLOCK, ++ [PIPE_FORMAT_RGTC1_SNORM] = VK_FORMAT_BC4_SNORM_BLOCK, ++ [PIPE_FORMAT_RGTC2_UNORM] = VK_FORMAT_BC5_UNORM_BLOCK, ++ 
[PIPE_FORMAT_RGTC2_SNORM] = VK_FORMAT_BC5_SNORM_BLOCK, ++ [PIPE_FORMAT_BPTC_RGBA_UNORM] = VK_FORMAT_BC7_UNORM_BLOCK, ++ [PIPE_FORMAT_BPTC_SRGBA] = VK_FORMAT_BC7_SRGB_BLOCK, ++ [PIPE_FORMAT_BPTC_RGB_FLOAT] = VK_FORMAT_BC6H_SFLOAT_BLOCK, ++ [PIPE_FORMAT_BPTC_RGB_UFLOAT] = VK_FORMAT_BC6H_UFLOAT_BLOCK, ++ ++ [PIPE_FORMAT_ETC1_RGB8] = VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RGB8] = VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGB8] = VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_RGB8A1] = VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGB8A1] = VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_RGBA8] = VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_SRGBA8] = VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK, ++ [PIPE_FORMAT_ETC2_R11_UNORM] = VK_FORMAT_EAC_R11_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_R11_SNORM] = VK_FORMAT_EAC_R11_SNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RG11_UNORM] = VK_FORMAT_EAC_R11G11_UNORM_BLOCK, ++ [PIPE_FORMAT_ETC2_RG11_SNORM] = VK_FORMAT_EAC_R11G11_SNORM_BLOCK, ++ ++ [PIPE_FORMAT_ASTC_4x4] = VK_FORMAT_ASTC_4x4_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_4x4_SRGB] = VK_FORMAT_ASTC_4x4_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_5x4] = VK_FORMAT_ASTC_5x4_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_5x4_SRGB] = VK_FORMAT_ASTC_5x4_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_5x5] = VK_FORMAT_ASTC_5x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_5x5_SRGB] = VK_FORMAT_ASTC_5x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_6x5] = VK_FORMAT_ASTC_6x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_6x5_SRGB] = VK_FORMAT_ASTC_6x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_6x6] = VK_FORMAT_ASTC_6x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_6x6_SRGB] = VK_FORMAT_ASTC_6x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x5] = VK_FORMAT_ASTC_8x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x5_SRGB] = VK_FORMAT_ASTC_8x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x6] = VK_FORMAT_ASTC_8x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x6_SRGB] = VK_FORMAT_ASTC_8x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_8x8] = VK_FORMAT_ASTC_8x8_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_8x8_SRGB] = VK_FORMAT_ASTC_8x8_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x5] = VK_FORMAT_ASTC_10x5_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x5_SRGB] = VK_FORMAT_ASTC_10x5_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x6] = VK_FORMAT_ASTC_10x6_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x6_SRGB] = VK_FORMAT_ASTC_10x6_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x8] = VK_FORMAT_ASTC_10x8_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x8_SRGB] = VK_FORMAT_ASTC_10x8_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_10x10] = VK_FORMAT_ASTC_10x10_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_10x10_SRGB] = VK_FORMAT_ASTC_10x10_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_12x10] = VK_FORMAT_ASTC_12x10_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_12x10_SRGB] = VK_FORMAT_ASTC_12x10_SRGB_BLOCK, ++ [PIPE_FORMAT_ASTC_12x12] = VK_FORMAT_ASTC_12x12_UNORM_BLOCK, ++ [PIPE_FORMAT_ASTC_12x12_SRGB] = VK_FORMAT_ASTC_12x12_SRGB_BLOCK, ++}; + ++VkFormat ++vk_format_from_pipe_format(enum pipe_format format) ++{ ++ return formats[format]; ++} + -+void -+vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); + VkImageAspectFlags + vk_format_aspects(VkFormat format) + { +diff --git a/src/vulkan/util/vk_format.h b/src/vulkan/util/vk_format.h +index 630cfbe5929..ca43c134bbb 100644 +--- a/src/vulkan/util/vk_format.h ++++ b/src/vulkan/util/vk_format.h +@@ -35,6 +35,9 @@ extern "C" { + enum pipe_format + vk_format_to_pipe_format(enum VkFormat vkformat); + ++VkFormat ++vk_format_from_pipe_format(enum pipe_format format); + -+void -+vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence 
*fence); + VkImageAspectFlags + vk_format_aspects(VkFormat format); + +@@ -158,6 +161,12 @@ vk_format_is_compressed(VkFormat format) + return vk_format_get_blockwidth(format) > 1; + } + ++static inline bool ++vk_format_is_block_compressed(VkFormat format) ++{ ++ return util_format_is_compressed(vk_format_to_pipe_format(format)); ++} + -+#endif /* VK_TEXCOMPRESS_BCN_H */ + static inline const struct util_format_description * + vk_format_description(VkFormat format) + { -- Gitee From f1b25b20cc768bab1e7ad3b85d72ed5c375f14d0 Mon Sep 17 00:00:00 2001 From: zhuanghb3 Date: Thu, 25 Sep 2025 02:18:45 +0000 Subject: [PATCH 2/3] =?UTF-8?q?=E4=BD=BF=E8=83=BDopengles=E6=94=AF?= =?UTF-8?q?=E6=8C=81astc=E6=A0=BC=E5=BC=8F=E7=A1=AC=E8=A7=A3=E7=A1=AC?= =?UTF-8?q?=E7=BC=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- patchForAndroid/external-mesa-0004.patch | 2126 ++++++++++++++++++++++ 1 file changed, 2126 insertions(+) create mode 100644 patchForAndroid/external-mesa-0004.patch diff --git a/patchForAndroid/external-mesa-0004.patch b/patchForAndroid/external-mesa-0004.patch new file mode 100644 index 0000000..bd9c3bf --- /dev/null +++ b/patchForAndroid/external-mesa-0004.patch @@ -0,0 +1,2126 @@ +diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build +index 71c8b16d4fc..691ebd9d616 100644 +--- a/src/amd/vulkan/meson.build ++++ b/src/amd/vulkan/meson.build +@@ -70,6 +70,7 @@ libradv_files = files( + 'radv_meta_fmask_copy.c', + 'radv_meta_fmask_expand.c', + 'radv_meta_astc_decode.c', ++ 'radv_meta_rgba_encode.c', + 'radv_meta_resolve.c', + 'radv_meta_resolve_cs.c', + 'radv_meta_resolve_fs.c', +diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c +index 3fd4ea777a8..dfd7bdca894 100644 +--- a/src/amd/vulkan/radv_image.c ++++ b/src/amd/vulkan/radv_image.c +@@ -38,6 +38,7 @@ + #include + + #include "gfx10_format_table.h" ++#include "vk_texcompress_bcn.h" + + static const VkImageUsageFlagBits RADV_IMAGE_USAGE_WRITE_BITS = + VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | +@@ -546,6 +547,66 @@ radv_patch_image_from_extra_info(struct radv_device *device, struct radv_image * + + extern bool radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + ++static char isEnableHardEncode[PROPERTY_VALUE_MAX]; ++VkFormat ++radv_translate_rgba2dxt5(VkFormat format) ++{ ++ switch (format) { ++ case VK_FORMAT_R8G8B8A8_UNORM: ++ return VK_FORMAT_BC3_UNORM_BLOCK; ++ case VK_FORMAT_R8G8B8A8_SRGB: ++ return VK_FORMAT_BC3_SRGB_BLOCK; ++ default: ++ return format; ++ } ++} ++ ++static bool ++radv_need_hard_encode(const struct radv_physical_device *pdev, const VkImageCreateInfo *pCreateInfo) ++{ ++ if (property_get("sys.vmi.vk.texturecompress", isEnableHardEncode, "0") && strcmp(isEnableHardEncode, "2") != 0) { ++ return false; ++ } ++ ++ // 暂不支持对图像数组进行压缩 ++ if (pCreateInfo->arrayLayers > 1) { ++ return false; ++ } ++ ++ if (!radv_is_format_emulated(pdev, pCreateInfo->format)) { ++ return false; ++ } ++ ++ const enum util_format_layout format_layout = vk_format_description(pCreateInfo->format)->layout; ++ if (format_layout != UTIL_FORMAT_LAYOUT_ASTC && ++ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_UNORM && ++ vk_texcompress_etc2_emulation_format(pCreateInfo->format) != VK_FORMAT_R8G8B8A8_SRGB) { ++ return false; ++ } ++ ++ // 过滤掉对稀疏纹理的压缩,对该部分纹理进行压缩时可能造成GPU reset。 ++ if (pCreateInfo->flags & (VK_IMAGE_CREATE_SPARSE_BINDING_BIT | 
VK_IMAGE_CREATE_SPARSE_RESIDENCY_BIT |
++ VK_IMAGE_CREATE_SPARSE_ALIASED_BIT)) {
++ return false;
++ }
++
++ // Check the image usage so that swapchain images are never compressed, and only compress VK_IMAGE_TYPE_2D textures
++ if (!(pCreateInfo->usage & VK_IMAGE_USAGE_SAMPLED_BIT) || pCreateInfo->imageType != VK_IMAGE_TYPE_2D) {
++ return false;
++ }
++
++ // The image is too large; the staging memory allocation would fail
++ if (pCreateInfo->extent.width >= 3960 && pCreateInfo->extent.height >= 3960) {
++ return false;
++ }
++
++ // Check the image size: only images of at least 256x256 are worth compressing
++ if (pCreateInfo->extent.width >= 256 && pCreateInfo->extent.height >= 256) {
++ return true;
++ }
++ return false;
++}
++
+ static VkFormat
+ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struct radv_image *image,
+ unsigned plane)
+@@ -553,10 +614,15 @@ radv_image_get_plane_format(const struct radv_physical_device *pdev, const struc
+ if (radv_is_format_emulated(pdev, image->vk_format)) {
+ if (plane == 0)
+ return image->vk_format;
+- if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC)
++ if (vk_format_description(image->vk_format)->layout == UTIL_FORMAT_LAYOUT_ASTC) {
++ if (image->isNeedHardEncode)
++ return radv_translate_rgba2dxt5(vk_texcompress_astc_emulation_format(image->vk_format));
+ return vk_texcompress_astc_emulation_format(image->vk_format);
+- else
++ } else {
++ if (image->isNeedHardEncode)
++ return radv_translate_rgba2dxt5(vk_texcompress_etc2_emulation_format(image->vk_format));
+ return vk_texcompress_etc2_emulation_format(image->vk_format);
++ }
+ }
+ return vk_format_get_plane_format(image->vk_format, plane);
+ }
+@@ -1710,6 +1776,12 @@ radv_destroy_image(struct radv_device *device, const VkAllocationCallbacks *pAll
+ vk_free2(&device->vk.alloc, pAllocator, image);
+ }
+
++void
++radv_internal_image_finish(struct radv_image *image)
++{
++ vk_object_base_finish(&image->base);
++}
++
+ static void
+ radv_image_print_info(struct radv_device *device, struct radv_image *image)
+ {
+@@ -1890,9 +1962,11 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_
+ vk_find_struct_const(pCreateInfo->pNext, EXTERNAL_MEMORY_IMAGE_CREATE_INFO);
+ assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO);
+
++ bool needHardEncode = radv_need_hard_encode(device->physical_device, pCreateInfo);
++
+ unsigned plane_count = radv_get_internal_plane_count(device->physical_device, format);
+
+- bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo);
++ bool needSoftEncode = !external_info && radv_need_soft_encode(pCreateInfo) && !needHardEncode;
+ bool needSoftDecode = needSoftEncode && radv_need_soft_decode(format);
+ VkFormat softDecodeFormat = radv_translate_etc(format);
+ VkFormat softEncodeFormat = radv_translate_rgb2BC(format);
+@@ -1936,6 +2010,7 @@ radv_image_create(VkDevice _device, const struct radv_image_create_info *create_
+ image->usage = pCreateInfo->usage;
+ image->flags = pCreateInfo->flags;
+ image->plane_count = vk_format_get_plane_count(format);
++ image->isNeedHardEncode = needHardEncode;
+
+ //满足要求的 ETC2 纹理使用软解
+ if (needSoftDecode) {
+diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
+index b1080ef8935..9b5a73cd1be 100644
+--- a/src/amd/vulkan/radv_meta.c
++++ b/src/amd/vulkan/radv_meta.c
+@@ -601,8 +601,14 @@ radv_device_init_meta(struct radv_device *device)
+ if (result != VK_SUCCESS)
+ goto fail_astc_decode;
+
++ result = radv_device_init_meta_rgba_encode_state(device);
++ if (result != VK_SUCCESS)
++ goto fail_bcn_encode;
++
+ return VK_SUCCESS;
+
++fail_bcn_encode:
++ 
radv_device_finish_meta_rgba_encode_state(device); + fail_astc_decode: + radv_device_finish_meta_astc_decode_state(device); + fail_etc_decode: +@@ -644,6 +650,7 @@ fail_clear: + void + radv_device_finish_meta(struct radv_device *device) + { ++ radv_device_finish_meta_rgba_encode_state(device); + radv_device_finish_meta_etc_decode_state(device); + radv_device_finish_meta_astc_decode_state(device); + radv_device_finish_accel_struct_build_state(device); +diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h +index d8ab75481d9..9702bfc9c7b 100644 +--- a/src/amd/vulkan/radv_meta.h ++++ b/src/amd/vulkan/radv_meta.h +@@ -107,6 +107,9 @@ void radv_device_finish_meta_etc_decode_state(struct radv_device *device); + VkResult radv_device_init_meta_astc_decode_state(struct radv_device *device, bool on_demand); + void radv_device_finish_meta_astc_decode_state(struct radv_device *device); + ++VkResult radv_device_init_meta_rgba_encode_state(struct radv_device *device); ++void radv_device_finish_meta_rgba_encode_state(struct radv_device *device); ++ + void radv_meta_save(struct radv_meta_saved_state *saved_state, struct radv_cmd_buffer *cmd_buffer, + uint32_t flags); + +@@ -232,6 +235,9 @@ void radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image + void radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, + const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); + ++void radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent); ++ + /** + * Return whether the bound pipeline is the FMASK decompress pass. + */ +diff --git a/src/amd/vulkan/radv_meta_astc_decode.c b/src/amd/vulkan/radv_meta_astc_decode.c +index 93a63692c09..006394c639b 100644 +--- a/src/amd/vulkan/radv_meta_astc_decode.c ++++ b/src/amd/vulkan/radv_meta_astc_decode.c +@@ -166,8 +166,15 @@ radv_meta_decode_astc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima + struct radv_image_view src_iview, dst_iview; + image_view_init(device, image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, + subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &src_iview); +- image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, +- subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ if (image->isNeedHardEncode) { ++ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); ++ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); ++ image_view_init(device, store_image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ } else { ++ image_view_init(device, image, VK_FORMAT_R8G8B8A8_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, vk_image_subresource_layer_count(vkImage, subresource), &dst_iview); ++ } + + VkExtent3D extent_copy = { + .width = extent.width, +diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c +index b418d3f2ab9..a0a6b41cce9 100644 +--- a/src/amd/vulkan/radv_meta_copy.c ++++ b/src/amd/vulkan/radv_meta_copy.c +@@ -26,6 +26,7 @@ + 
#include "main/formats.h" + #include "main/texcompress_etc.h" + #include "main/texcompress_bptc_tmp.h" ++#include "vk_texcompress_bcn.h" + + static VkExtent3D + meta_image_block_size(const struct radv_image *image) +@@ -471,6 +472,74 @@ copy_buffer_to_image(struct radv_cmd_buffer *cmd_buffer, struct radv_buffer *buf + radv_meta_restore(&saved_state, cmd_buffer); + } + ++void radv_create_staging_images(struct radv_image *dst_image, struct radv_cmd_buffer *cmd_buffer) ++{ ++ struct vk_texcompress_bcn_image *staging_images = ++ vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(struct vk_texcompress_bcn_image), 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); ++ dst_image->staging_images = staging_images; ++ ++ uint32_t size = 0; ++ VkExtent3D extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; ++ VkFormat format = VK_FORMAT_R8G8B8A8_UNORM; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_rgba, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_rgba, dst_image->staging_images->image_rgba); ++ size += _image_rgba->size; ++ ++ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; ++ format = VK_FORMAT_R32G32_UINT; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_bc1, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_bc1, dst_image->staging_images->image_bc1); ++ size += _image_bc1->size; ++ ++ extent = (VkExtent3D){(dst_image->info.width + 3) / 4, (dst_image->info.height + 3) / 4, 1}; ++ format = VK_FORMAT_R32G32_UINT; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_bc4, extent, format, 0); ++ RADV_FROM_HANDLE(radv_image, _image_bc4, dst_image->staging_images->image_bc4); ++ size += _image_bc4->size; ++ ++ extent = (VkExtent3D){dst_image->info.width, dst_image->info.height, 1}; ++ format = VK_FORMAT_R8G8B8A8_UNORM; ++ vk_texcompress_create_image(&cmd_buffer->device->vk, NULL, &staging_images->image_mem, ++ &staging_images->image_dummy, extent, format, size); ++ RADV_FROM_HANDLE(radv_image, _image_dummy, dst_image->staging_images->image_dummy); ++ ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_rgba, &staging_images->image_mem, 0); ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc1, &staging_images->image_mem, _image_rgba->size); ++ bind_memory(&cmd_buffer->device->vk, &staging_images->image_bc4, &staging_images->image_mem, _image_bc1->size); ++ ++ cmd_buffer->vk.is_need_hard_encode = true; ++ ++ vk_texcompress_insert_head(&cmd_buffer->vk.staging_image_list_head, dst_image->staging_images); ++} ++ ++void radv_copy_buffer_hard_encode(struct radv_image *dst_image, ++ const enum util_format_layout format_layout, ++ struct radv_cmd_buffer *cmd_buffer, ++ const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo) ++{ ++ radv_create_staging_images(dst_image, cmd_buffer); ++ for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ 
pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } ++ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, ++ &pCopyBufferToImageInfo->pRegions[r].imageSubresource, ++ pCopyBufferToImageInfo->pRegions[r].imageOffset, ++ pCopyBufferToImageInfo->pRegions[r].imageExtent); ++ } ++} ++ + extern bool + radv_is_format_emulated(const struct radv_physical_device *physical_device, VkFormat format); + +@@ -495,6 +564,12 @@ radv_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer, + radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); + + const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++ ++ if (dst_image->isNeedHardEncode) { ++ radv_copy_buffer_hard_encode(dst_image, format_layout, cmd_buffer, pCopyBufferToImageInfo); ++ return; ++ } ++ + for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { + if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { + radv_meta_decode_astc(cmd_buffer, dst_image, pCopyBufferToImageInfo->dstImageLayout, +@@ -847,6 +922,36 @@ copy_image(struct radv_cmd_buffer *cmd_buffer, struct radv_image *src_image, + radv_meta_restore(&saved_state, cmd_buffer); + } + ++void radv_copy_image_hard_encode(struct radv_image *dst_image, ++ const enum util_format_layout format_layout, ++ struct radv_cmd_buffer *cmd_buffer, ++ const VkCopyImageInfo2 *pCopyImageInfo) ++{ ++ radv_create_staging_images(dst_image, cmd_buffer); ++ RADV_FROM_HANDLE(radv_image, src_image, pCopyImageInfo->srcImage); ++ for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { ++ VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; ++ if (src_image->vk_format != dst_image->vk_format) { ++ dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk_format) * ++ vk_format_get_blockwidth(dst_image->vk_format); ++ dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk_format) * ++ vk_format_get_blockheight(dst_image->vk_format); ++ } ++ if (format_layout == UTIL_FORMAT_LAYOUT_ASTC) { ++ radv_meta_decode_astc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } else { ++ radv_meta_decode_etc(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } ++ radv_meta_bcn_encoded(cmd_buffer, dst_image, pCopyImageInfo->dstImageLayout, ++ &pCopyImageInfo->pRegions[r].dstSubresource, pCopyImageInfo->pRegions[r].dstOffset, ++ dst_extent); ++ } ++} ++ + VKAPI_ATTR void VKAPI_CALL + radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyImageInfo) + { +@@ -866,6 +971,12 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI + radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_image); + + const enum util_format_layout format_layout = vk_format_description(dst_image->vk_format)->layout; ++ ++ if (dst_image->isNeedHardEncode) { ++ radv_copy_image_hard_encode(dst_image, format_layout, cmd_buffer, pCopyImageInfo); ++ return; ++ } ++ + for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) { + VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent; + if (src_image->vk_format != dst_image->vk_format) { +diff --git a/src/amd/vulkan/radv_meta_etc_decode.c 
b/src/amd/vulkan/radv_meta_etc_decode.c +index d5d002c63a0..3685218d915 100644 +--- a/src/amd/vulkan/radv_meta_etc_decode.c ++++ b/src/amd/vulkan/radv_meta_etc_decode.c +@@ -801,23 +801,45 @@ radv_meta_decode_etc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *imag + store_format = VK_FORMAT_R8G8B8A8_UNORM; + } + struct radv_image_view dest_iview; +- radv_image_view_init( +- &dest_iview, cmd_buffer->device, +- &(VkImageViewCreateInfo){ +- .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, +- .image = radv_image_to_handle(image), +- .viewType = radv_meta_get_view_type(image), +- .format = store_format, +- .subresourceRange = +- { +- .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, +- .baseMipLevel = subresource->mipLevel, +- .levelCount = 1, +- .baseArrayLayer = 0, +- .layerCount = subresource->baseArrayLayer + subresource->layerCount, +- }, +- }, +- NULL); ++ if (image->isNeedHardEncode) { ++ RADV_FROM_HANDLE(radv_image, store_image, image->staging_images->image_rgba); ++ cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, store_image); ++ radv_image_view_init( ++ &dest_iview, cmd_buffer->device, ++ &(VkImageViewCreateInfo){ ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(store_image), ++ .viewType = radv_meta_get_view_type(store_image), ++ .format = store_format, ++ .subresourceRange = ++ { ++ .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, ++ .baseMipLevel = subresource->mipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = subresource->baseArrayLayer + subresource->layerCount, ++ }, ++ }, ++ NULL); ++ } else { ++ radv_image_view_init( ++ &dest_iview, cmd_buffer->device, ++ &(VkImageViewCreateInfo){ ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(image), ++ .viewType = radv_meta_get_view_type(image), ++ .format = store_format, ++ .subresourceRange = ++ { ++ .aspectMask = VK_IMAGE_ASPECT_PLANE_1_BIT, ++ .baseMipLevel = subresource->mipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = subresource->baseArrayLayer + subresource->layerCount, ++ }, ++ }, ++ NULL); ++ } + + decode_etc(cmd_buffer, &src_iview, &dest_iview, &(VkOffset3D){offset.x, offset.y, base_slice}, + &(VkExtent3D){extent.width, extent.height, slice_count}); +diff --git a/src/amd/vulkan/radv_meta_rgba_encode.c b/src/amd/vulkan/radv_meta_rgba_encode.c +new file mode 100644 +index 00000000000..b144767143d +--- /dev/null ++++ b/src/amd/vulkan/radv_meta_rgba_encode.c +@@ -0,0 +1,312 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include ++#include ++ ++#include "radv_meta.h" ++#include "sid.h" ++#include "vk_common_entrypoints.h" ++#include "vk_format.h" ++#include "vk_texcompress_bcn.h" ++#include "radv_debug.h" ++ ++struct radv_dispatch_info { ++ /** ++ * Determine the layout of the grid (in block units) to be used. ++ */ ++ uint32_t blocks[3]; ++ ++ /** ++ * A starting offset for the grid. If unaligned is set, the offset ++ * must still be aligned. ++ */ ++ uint32_t offsets[3]; ++ ++ /** ++ * Whether it's an unaligned compute dispatch. ++ */ ++ bool unaligned; ++ ++ /** ++ * Whether waves must be launched in order. ++ */ ++ bool ordered; ++ ++ /** ++ * Indirect compute parameters resource. 
++ */ ++ struct radeon_winsys_bo *indirect; ++ uint64_t va; ++}; ++ ++VkResult ++radv_device_init_meta_rgba_encode_state(struct radv_device *device) ++{ ++ const struct radv_physical_device *pdev = device->physical_device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ return vk_texcompress_bcn_init(&device->vk, &state->alloc, radv_pipeline_cache_to_handle(&state->cache), state->bcn_encoded); ++} ++ ++void ++radv_device_finish_meta_rgba_encode_state(struct radv_device *device) ++{ ++ struct radv_meta_state *state = &device->meta_state; ++ struct vk_texcompress_bcn_state **bcns = state->bcn_encoded; ++ ++ if (bcns) ++ vk_texcompress_bcn_finish(&device->vk, &state->alloc, state->bcn_encoded); ++} ++ ++extern void ++radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info); ++ ++static VkImageViewType ++get_view_type(const struct radv_image *image) ++{ ++ switch (image->type) { ++ case VK_IMAGE_TYPE_2D: ++ return VK_IMAGE_VIEW_TYPE_2D_ARRAY; ++ case VK_IMAGE_TYPE_3D: ++ return VK_IMAGE_VIEW_TYPE_3D; ++ default: ++ unreachable("bad VkImageViewType"); ++ } ++} ++ ++static void ++image_view_init(struct radv_device *device, struct radv_image *image, VkFormat format, VkImageAspectFlags aspectMask, ++ uint32_t baseMipLevel, uint32_t baseArrayLayer, uint32_t layerCount, VkComponentMapping *component, ++ struct radv_image_view *iview) ++{ ++ VkImageViewCreateInfo iview_create_info = { ++ .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, ++ .image = radv_image_to_handle(image), ++ .viewType = get_view_type(image), ++ .format = format, ++ .subresourceRange = ++ { ++ .aspectMask = aspectMask, ++ .baseMipLevel = baseMipLevel, ++ .levelCount = 1, ++ .baseArrayLayer = 0, ++ .layerCount = baseArrayLayer + layerCount, ++ }, ++ }; ++ if (component) { ++ iview_create_info.components = *component; ++ } ++ ++ radv_image_view_init(iview, device, &iview_create_info, NULL); ++} ++ ++static void ++encode_bc1(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout, ++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, ++ struct radv_image *src_image, struct radv_image *dst_image) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ struct radv_image_view src_iview, dst_iview; ++ VkComponentMapping component = { ++ .r = VK_COMPONENT_SWIZZLE_R, ++ .g = VK_COMPONENT_SWIZZLE_G, ++ .b = VK_COMPONENT_SWIZZLE_B, ++ .a = VK_COMPONENT_SWIZZLE_A ++ }; ++ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &src_iview); ++ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, NULL, &dst_iview); ++ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) | ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); ++ ++ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; ++ vk_texcompress_bc1_fill_write_descriptor_sets(state->bcn_encoded[BC1_ENCODE], &write_desc_set, ++ radv_image_view_to_handle(&src_iview), layout, ++ radv_image_view_to_handle(&dst_iview)); ++ ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC1_ENCODE]->p_layout, ++ 0, /* set number */ 
++ 3, write_desc_set.descriptor_set);
++
++ VkPipeline pipeline =
++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc,
++ state->bcn_encoded[BC1_ENCODE],
++ radv_pipeline_cache_to_handle(&state->cache), BC1_ENCODE);
++ if (pipeline == VK_NULL_HANDLE)
++ return;
++
++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
++
++ const unsigned num_refinements = 1;
++ const unsigned push_constants[2] = {num_refinements};
++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC1_ENCODE]->p_layout,
++ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
++
++ struct radv_dispatch_info info = {
++ .blocks[0] = DIV_ROUND_UP(extent->width, 32),
++ .blocks[1] = DIV_ROUND_UP(extent->height, 32),
++ .blocks[2] = 1,
++ };
++ radv_compute_dispatch(cmd_buffer, &info);
++ radv_image_view_finish(&src_iview);
++ radv_image_view_finish(&dst_iview);
++}
++
++static void
++encode_bc4(struct radv_cmd_buffer *cmd_buffer, VkImageLayout layout,
++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource,
++ struct radv_image *src_image, struct radv_image *dst_image)
++{
++ struct radv_device *device = cmd_buffer->device;
++ struct radv_meta_state *state = &device->meta_state;
++
++ struct radv_image_view src_iview, dst_iview;
++ VkComponentMapping component = {
++ .r = VK_COMPONENT_SWIZZLE_A,
++ .g = VK_COMPONENT_SWIZZLE_ZERO,
++ .b = VK_COMPONENT_SWIZZLE_ZERO,
++ .a = VK_COMPONENT_SWIZZLE_ONE
++ };
++ image_view_init(device, src_image, VK_FORMAT_R8G8B8A8_UNORM, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel,
++ subresource->baseArrayLayer, 1, &component, &src_iview);
++ image_view_init(device, dst_image, VK_FORMAT_R16G16B16A16_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel,
++ subresource->baseArrayLayer, 1, NULL, &dst_iview);
++
++ cmd_buffer->state.flush_bits |=
++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, src_iview.image) |
++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image);
++
++ struct vk_texcompress_bc4_write_descriptor_set write_desc_set;
++ vk_texcompress_bc4_fill_write_descriptor_sets(state->bcn_encoded[BC4_ENCODE], &write_desc_set,
++ radv_image_view_to_handle(&src_iview), layout,
++ radv_image_view_to_handle(&dst_iview));
++
++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC4_ENCODE]->p_layout,
++ 0, /* set number */
++ 2, write_desc_set.descriptor_set);
++
++ VkPipeline pipeline =
++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc,
++ state->bcn_encoded[BC4_ENCODE],
++ radv_pipeline_cache_to_handle(&state->cache), BC4_ENCODE);
++ if (pipeline == VK_NULL_HANDLE)
++ return;
++
++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
++
++ const unsigned alpha_channel_id = 0; // the r channel (alpha was swizzled into r by the view above)
++ const unsigned use_norm = 0;
++ const unsigned push_constants[2] = {alpha_channel_id, use_norm};
++ radv_CmdPushConstants(radv_cmd_buffer_to_handle(cmd_buffer), state->bcn_encoded[BC4_ENCODE]->p_layout,
++ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), &push_constants);
++
++ struct radv_dispatch_info info = {
++ .blocks[0] = 1,
++ .blocks[1] = DIV_ROUND_UP(extent->width, 16),
++ .blocks[2] = DIV_ROUND_UP(extent->height, 16),
++ };
++ radv_compute_dispatch(cmd_buffer, &info);
++ radv_image_view_finish(&src_iview);
++ radv_image_view_finish(&dst_iview);
++}
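++
++/* Background sketch (not from the original patch; assumes the standard BC3
++ * block layout, which the staging formats used here are consistent with):
++ * every 4x4 texel block of a BC3 image is 128 bits, a 64-bit BC4-style alpha
++ * half followed by a 64-bit BC1-style color half, roughly:
++ *
++ *   struct bc3_block {       // one 4x4 texel block, 16 bytes
++ *      uint64_t alpha_bits;  // produced by encode_bc4() into image_bc4
++ *      uint64_t color_bits;  // produced by encode_bc1() into image_bc1
++ *   };
++ *
++ * so encode_bc3() below only needs to interleave the two R32G32_UINT staging
++ * images into the R32G32B32A32_UINT plane-1 view of the destination.
++ */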
radv_image_view_finish(&src_iview); ++ radv_image_view_finish(&dst_iview); ++} ++ ++static void ++encode_bc3(struct radv_cmd_buffer *cmd_buffer, ++ const VkExtent3D *extent, const VkImageSubresourceLayers *subresource, ++ struct radv_image *bc1_image, struct radv_image *bc4_image, struct radv_image *dst_image) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_state *state = &device->meta_state; ++ ++ struct radv_image_view bc1_iview, bc4_iview, dst_iview; ++ VkComponentMapping component = { ++ .r = VK_COMPONENT_SWIZZLE_R, ++ .g = VK_COMPONENT_SWIZZLE_G, ++ .b = VK_COMPONENT_SWIZZLE_ZERO, ++ .a = VK_COMPONENT_SWIZZLE_ONE ++ }; ++ image_view_init(device, bc1_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &bc1_iview); ++ image_view_init(device, bc4_image, VK_FORMAT_R32G32_UINT, VK_IMAGE_ASPECT_COLOR_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, &component, &bc4_iview); ++ image_view_init(device, dst_image, VK_FORMAT_R32G32B32A32_UINT, VK_IMAGE_ASPECT_PLANE_1_BIT, subresource->mipLevel, ++ subresource->baseArrayLayer, 1, NULL, &dst_iview); ++ ++ cmd_buffer->state.flush_bits |= ++ RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc1_iview.image) | ++ radv_src_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, bc4_iview.image) | ++ radv_dst_access_flush(cmd_buffer, VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, dst_iview.image); ++ ++ struct vk_texcompress_bcn_write_descriptor_set write_desc_set; ++ vk_texcompress_bc3_fill_write_descriptor_sets(state->bcn_encoded[BC3_ENCODE], &write_desc_set, ++ radv_image_view_to_handle(&bc1_iview), radv_image_view_to_handle(&bc4_iview), ++ radv_image_view_to_handle(&dst_iview)); ++ ++ radv_meta_push_descriptor_set(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->bcn_encoded[BC3_ENCODE]->p_layout, ++ 0, /* set number */ ++ 3, write_desc_set.descriptor_set); ++ ++ VkPipeline pipeline = ++ vk_texcompress_rgba_get_encode_pipeline(&device->vk, &state->alloc, ++ state->bcn_encoded[BC3_ENCODE], ++ radv_pipeline_cache_to_handle(&state->cache), BC3_ENCODE); ++ if (pipeline == VK_NULL_HANDLE) ++ return; ++ ++ radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); ++ ++ struct radv_dispatch_info info = { ++ .blocks[0] = DIV_ROUND_UP(extent->width, 32), ++ .blocks[1] = DIV_ROUND_UP(extent->height, 32), ++ .blocks[2] = 1, ++ }; ++ radv_compute_dispatch(cmd_buffer, &info); ++ radv_image_view_finish(&bc1_iview); ++ radv_image_view_finish(&bc4_iview); ++ radv_image_view_finish(&dst_iview); ++} ++ ++void ++radv_meta_bcn_encoded(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout layout, ++ const VkImageSubresourceLayers *subresource, VkOffset3D offset, VkExtent3D extent) ++{ ++ struct radv_device *device = cmd_buffer->device; ++ struct radv_meta_saved_state saved_state; ++ radv_meta_save(&saved_state, cmd_buffer, ++ RADV_META_SAVE_COMPUTE_PIPELINE | RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS); ++ ++ VkImage _image = radv_image_to_handle(image); ++ VK_FROM_HANDLE(vk_image, vkImage, _image); ++ ++ extent = radv_sanitize_image_extent(image->type, extent); ++ offset = radv_sanitize_image_offset(image->type, offset); ++ ++ VkExtent3D extent_copy = { ++ .width = extent.width, ++ .height = extent.height, ++ .depth = 1, ++ }; ++ ++ RADV_FROM_HANDLE(radv_image, rgba_image, 
image->staging_images->image_rgba);
++   RADV_FROM_HANDLE(radv_image, image1, image->staging_images->image_bc1);
++   RADV_FROM_HANDLE(radv_image, image4, image->staging_images->image_bc4);
++
++   encode_bc1(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image1);
++
++   encode_bc4(cmd_buffer, layout, &extent_copy, subresource, rgba_image, image4);
++
++   encode_bc3(cmd_buffer, &extent_copy, subresource, image1, image4, image);
++
++   radv_meta_restore(&saved_state, cmd_buffer);
++}
+diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
+index 472858dd401..04a670d714d 100644
+--- a/src/amd/vulkan/radv_private.h
++++ b/src/amd/vulkan/radv_private.h
+@@ -98,6 +98,7 @@ typedef uint32_t xcb_window_t;
+ 
+ #include "vk_texcompress_astc.h"
+ #include "wsi_common.h"
++#include "vk_texcompress_bcn.h"
+ 
+ #ifdef __cplusplus
+ extern "C"
+@@ -681,6 +682,8 @@ struct radv_meta_state {
+    } etc_decode;
+ 
+    struct vk_texcompress_astc_state *astc_decode;
++
++   struct vk_texcompress_bcn_state *bcn_encoded[VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES];
+ };
+ 
+ #define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
+@@ -2090,8 +2093,12 @@ struct radv_image {
+    bool dcc_sign_reinterpret;
+    bool support_comp_to_single;
+ 
+-   bool isNeedSoftEncode;// 是否需要使用 软编
+-   bool isNeedSoftDecode;// 是否需要使用 软解
++   bool isNeedSoftEncode;  // whether software (CPU) encoding is needed
++   bool isNeedSoftDecode;  // whether software (CPU) decoding is needed
++
++   bool isNeedHardEncode;  // whether hardware (compute-shader) encoding is needed
++   struct vk_texcompress_bcn_image *staging_images;  // intermediate resources for hardware encoding
++
+    VkFormat srcFormat; //用于保存纹理的原始格式
+    VkFormat unpackETCFormat;//用于保存ETC纹理解压后的格式
+ 
+@@ -2474,6 +2481,8 @@ void radv_image_view_init(struct radv_image_view *view, struct radv_device *devi
+                           const struct radv_image_view_extra_create_info *extra_create_info);
+ void radv_image_view_finish(struct radv_image_view *iview);
+ 
++void radv_internal_image_finish(struct radv_image *image);
++
+ VkFormat radv_get_aspect_format(struct radv_image *image, VkImageAspectFlags mask);
+ 
+ struct radv_sampler_ycbcr_conversion_state {
+diff --git a/src/compiler/glsl/bc1.glsl b/src/compiler/glsl/bc1.glsl
+index 251635d7b69..627b2e5419c 100644
+--- a/src/compiler/glsl/bc1.glsl
++++ b/src/compiler/glsl/bc1.glsl
+@@ -1,3 +1,4 @@
++#version 310 es
+ /*
+  * Copyright 2020-2022 Matias N. Goldberg
+  * Copyright 2022 Intel Corporation
+@@ -21,7 +22,6 @@
+  * DEALINGS IN THE SOFTWARE.
+ */ + +-#version 310 es + + #if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or +@@ -31,10 +31,31 @@ + #define const + #endif + +-%s // include "CrossPlatformSettings_piece_all.glsl" +- + #define FLT_MAX 340282346638528859811704183484516925440.0f + ++#ifdef VULKAN ++ ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" ++ ++layout(push_constant, std430) uniform pc { ++ uint p_numRefinements; ++}; ++ ++layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ ++layout (std430, set = 0, binding = 1) readonly restrict buffer globalBuffer ++{ ++ float2 c_oMatch5[256]; ++ float2 c_oMatch6[256]; ++}; ++ ++layout (rgba16ui, set = 0, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" ++ + layout( location = 0 ) uniform uint p_numRefinements; + + uniform sampler2D srcTex; +@@ -47,6 +68,8 @@ layout( std430, binding = 1 ) readonly restrict buffer globalBuffer + float2 c_oMatch6[256]; + }; + ++#endif ++ + layout( local_size_x = 8, // + local_size_y = 8, // + local_size_z = 1 ) in; +diff --git a/src/compiler/glsl/bc4.glsl b/src/compiler/glsl/bc4.glsl +index 2486fa4ae0c..0b9d236d624 100644 +--- a/src/compiler/glsl/bc4.glsl ++++ b/src/compiler/glsl/bc4.glsl +@@ -1,3 +1,5 @@ ++#version 310 es ++ + /* + * Copyright 2020-2022 Matias N. Goldberg + * Copyright 2022 Intel Corporation +@@ -21,8 +23,6 @@ + * DEALINGS IN THE SOFTWARE. + */ + +-#version 310 es +- + #if defined(GL_ES) && GL_ES == 1 + // Desktop GLSL allows the const keyword for either compile-time or + // run-time constants. GLSL ES only allows the keyword for compile-time +@@ -33,20 +33,37 @@ + + #define __sharedOnlyBarrier memoryBarrierShared();barrier(); + +-%s // include "CrossPlatformSettings_piece_all.glsl" ++#ifdef VULKAN + +-shared float2 g_minMaxValues[4u * 4u * 4u]; +-shared uint2 g_mask[4u * 4u]; ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" + +-layout( location = 0 ) uniform uint2 params; ++layout(push_constant, std430) uniform pc { ++ uint2 params; ++}; + +-#define p_channelIdx params.x +-#define p_useSNorm params.y ++layout ( set = 0, binding = 0 ) uniform sampler2D srcTex; ++ ++layout (rgba16ui, set = 0, binding = 1) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" + + uniform sampler2D srcTex; + + layout( rgba16ui ) uniform restrict writeonly mediump uimage2D dstTexture; + ++layout( location = 0 ) uniform uint2 params; ++ ++#endif ++ ++shared float2 g_minMaxValues[4u * 4u * 4u]; ++shared uint2 g_mask[4u * 4u]; ++ ++#define p_channelIdx params.x ++#define p_useSNorm params.y ++ + layout( local_size_x = 4, // + local_size_y = 4, // + local_size_z = 4 ) in; +diff --git a/src/compiler/glsl/etc2_rgba_stitch.glsl b/src/compiler/glsl/etc2_rgba_stitch.glsl +index f90accb82e1..8d50c469956 100644 +--- a/src/compiler/glsl/etc2_rgba_stitch.glsl ++++ b/src/compiler/glsl/etc2_rgba_stitch.glsl +@@ -1,3 +1,5 @@ ++#version 310 es ++ + /* + * Copyright 2020-2022 Matias N. 
Goldberg + * Copyright 2022 Intel Corporation +@@ -25,18 +27,30 @@ + // This compute shader merely stitches them together to form the final result + // It's also used by RG11 driver to stitch two R11 into one RG11 + +-#version 310 es ++#ifdef VULKAN + +-%s // include "CrossPlatformSettings_piece_all.glsl" ++#extension GL_GOOGLE_include_directive : require ++#include "CrossPlatformSettings_piece_all.glsl" + +-layout( local_size_x = 8, // +- local_size_y = 8, // +- local_size_z = 1 ) in; ++layout ( binding = 0 ) uniform highp usampler2D srcRGB; ++layout ( binding = 1 ) uniform highp usampler2D srcAlpha; ++ ++layout ( rgba32ui, binding = 2 ) uniform restrict writeonly mediump uimage2D dstTexture; ++ ++#else ++ ++%s // include "CrossPlatformSettings_piece_all.glsl" + + layout( binding = 0 ) uniform highp usampler2D srcRGB; + layout( binding = 1 ) uniform highp usampler2D srcAlpha; + layout( rgba32ui ) uniform restrict writeonly highp uimage2D dstTexture; + ++#endif ++ ++layout( local_size_x = 8, // ++ local_size_y = 8, // ++ local_size_z = 1 ) in; ++ + void main() + { + uint2 etcRgb = OGRE_Load2D( srcRGB, int2( gl_GlobalInvocationID.xy ), 0 ).xy; +diff --git a/src/compiler/meson.build b/src/compiler/meson.build +index e18af3bf335..85b19b84bec 100644 +--- a/src/compiler/meson.build ++++ b/src/compiler/meson.build +@@ -24,6 +24,14 @@ inc_spirv = include_directories('spirv') + + astc_decoder_glsl_file = files('glsl/astc_decoder.glsl') + ++bc3_encode_head_dir = meson.current_source_dir() / 'glsl' ++ ++bc1_glsl_file = files('glsl/bc1.glsl') ++ ++bc4_glsl_file = files('glsl/bc4.glsl') ++ ++etc2_rgba_stitch_glsl_file = files('glsl/etc2_rgba_stitch.glsl') ++ + files_libcompiler = files( + 'builtin_type_macros.h', + 'glsl_types.cpp', +diff --git a/src/util/bc1_tables.h b/src/util/bc1_tables.h +new file mode 100644 +index 00000000000..654ccdadb78 +--- /dev/null ++++ b/src/util/bc1_tables.h +@@ -0,0 +1,124 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */ ++ ++/* These tables have been generated with: ++ ++ #define STB_DXT_IMPLEMENTATION ++ #include "stb_dxt.h" ++ #include ++ ++ int main() ++ { ++ stb__InitDXT(); ++ ++ printf( "static unsigned char stb__OMatch5[256][2] = {\n" ); ++ for( size_t i = 0u; i < 256u; ++i ) ++ { ++ printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] ); ++ if( i != 255u ) ++ printf( ",\n" ); ++ } ++ printf( "\n};\n" ); ++ ++ printf( "static unsigned char stb__OMatch6[256][2] = {\n" ); ++ for( size_t i = 0u; i < 256u; ++i ) ++ { ++ printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] ); ++ if( i != 255u ) ++ printf( ",\n" ); ++ } ++ printf( "\n};\n" ); ++ ++ return 0; ++ } ++ ++ Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min ++*/ ++static unsigned char stb__OMatch5[256][2] = { ++ { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, ++ { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 }, ++ { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 }, ++ { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 }, ++ { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 }, ++ { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 }, ++ { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 }, ++ { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 }, ++ { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 }, ++ { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 }, ++ { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 }, ++ { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 }, ++ { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 }, ++ { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 }, ++ { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 }, ++ { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 }, ++ { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 }, ++ { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 }, ++ { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 }, ++ { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 }, ++ { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 }, ++ { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 }, ++ { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 }, ++ { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 }, ++ { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 }, ++ { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 }, ++ { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 }, ++ { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 }, ++ { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 }, ++ { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 
26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 }, ++ { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 }, ++ { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 } ++}; ++ ++static unsigned char stb__OMatch6[256][2] = { ++ { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 }, ++ { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 }, ++ { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 }, ++ { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 }, ++ { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 }, ++ { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 }, ++ { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 }, ++ { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 }, ++ { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 }, ++ { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 }, ++ { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 }, ++ { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 }, ++ { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 }, ++ { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 }, ++ { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 }, ++ { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 }, ++ { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 }, ++ { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 }, ++ { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 }, ++ { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 }, ++ { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 }, ++ { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 }, ++ { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 }, ++ { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 }, ++ { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 }, ++ { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 }, ++ { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 }, ++ { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 }, ++ { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 }, ++ { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 }, ++ { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 }, ++ { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 } ++}; +diff --git a/src/vulkan/runtime/meson.build b/src/vulkan/runtime/meson.build +index 1c02e1a5dd8..54c89d793a0 100644 +--- a/src/vulkan/runtime/meson.build ++++ 
b/src/vulkan/runtime/meson.build
+@@ -93,12 +93,31 @@ endif
+ 
+ if prog_glslang.found()
+   vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h')
++  vulkan_runtime_files += files('vk_texcompress_bcn.c', 'vk_texcompress_bcn.h')
+   vulkan_runtime_files += custom_target(
+     'astc_spv.h',
+     input : astc_decoder_glsl_file,
+     output : 'astc_spv.h',
+     command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
+   )
++  vulkan_runtime_files += custom_target(
++    'bc1_spv.h',
++    input : bc1_glsl_file,
++    output : 'bc1_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
++  vulkan_runtime_files += custom_target(
++    'bc4_spv.h',
++    input : bc4_glsl_file,
++    output : 'bc4_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
++  vulkan_runtime_files += custom_target(
++    'bc3_spv.h',
++    input : etc2_rgba_stitch_glsl_file,
++    output : 'bc3_spv.h',
++    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-I' + bc3_encode_head_dir, '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
++  )
+ endif
+ 
+ vk_common_entrypoints = custom_target(
+diff --git a/src/vulkan/runtime/vk_command_buffer.h b/src/vulkan/runtime/vk_command_buffer.h
+index 36099d03a56..1a04b53510f 100644
+--- a/src/vulkan/runtime/vk_command_buffer.h
++++ b/src/vulkan/runtime/vk_command_buffer.h
+@@ -28,6 +28,7 @@
+ #include "vk_object.h"
+ #include "util/list.h"
+ #include "util/u_dynarray.h"
++#include "vk_texcompress_bcn.h"
+ 
+ #ifdef __cplusplus
+ extern "C" {
+@@ -130,6 +131,12 @@ struct vk_command_buffer {
+    /* This uses the same trick as STACK_ARRAY */
+    struct vk_attachment_state *attachments;
+    struct vk_attachment_state _attachments[8];
++
++   bool is_need_hard_encode;
++
++   struct vk_texcompress_staging_images* staging_image_list_head;
++
++   struct vk_texcompress_dummy_image* dummy;
+ };
+ 
+ VK_DEFINE_HANDLE_CASTS(vk_command_buffer, base, VkCommandBuffer,
+diff --git a/src/vulkan/runtime/vk_queue.c b/src/vulkan/runtime/vk_queue.c
+index 6216af9d856..15994e3420d 100644
+--- a/src/vulkan/runtime/vk_queue.c
++++ b/src/vulkan/runtime/vk_queue.c
+@@ -42,6 +42,7 @@
+ #include "vk_util.h"
+ 
+ #include "vulkan/wsi/wsi_common.h"
++#include "vk_texcompress_bcn.h"
+ 
+ static VkResult
+ vk_queue_start_submit_thread(struct vk_queue *queue);
+@@ -1140,7 +1141,22 @@ vk_common_QueueSubmit2KHR(VkQueue _queue,
+       }
+    }
+ 
++   int n_command_buffers = 0;
++   for (uint32_t s = 0; s < submitCount; s++) {
++      n_command_buffers += pSubmits[s].commandBufferInfoCount;
++   }
++   const uint32_t const_num_cmd_buf = n_command_buffers;
++   uint32_t num_cmd_buf = 0;
++   struct vk_command_buffer *need_delete_cmd_buf[const_num_cmd_buf];
++
+    for (uint32_t i = 0; i < submitCount; i++) {
++      for (uint32_t j = 0; j < pSubmits[i].commandBufferInfoCount; j++) {
++         VK_FROM_HANDLE(vk_command_buffer, commandBuffer, (pSubmits[i].pCommandBufferInfos)[j].commandBuffer);
++         if (commandBuffer->is_need_hard_encode) {
++            need_delete_cmd_buf[num_cmd_buf] = commandBuffer;
++            num_cmd_buf += 1;
++         }
++      }
+       struct vulkan_submit_info info = {
+          .pNext = pSubmits[i].pNext,
+          .command_buffer_count = pSubmits[i].commandBufferInfoCount,
+@@ -1156,6 +1172,20 @@ vk_common_QueueSubmit2KHR(VkQueue _queue,
+          return result;
+       }
+ 
++   if (num_cmd_buf > 0 && fence == NULL) {
++      vk_texcompress_create_fence(queue->base.device, NULL, &_fence);
++   }
++
++   if (num_cmd_buf > 0) {
++
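++      /* If the caller supplied no fence, the dummy fence created above starts
++       * out signaled and this wait returns immediately; otherwise it blocks
++       * until the submission finishes, after which the staging images that
++       * backed the GPU BCn encode can be destroyed. */
++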
vk_texcompress_wait_fence(queue->base.device, NULL, &_fence); ++ for (uint32_t i = 0; i < num_cmd_buf; i++) { ++ while (need_delete_cmd_buf[i]->staging_image_list_head != NULL) { ++ vk_texcompress_delete_head(&need_delete_cmd_buf[i]->staging_image_list_head, queue->base.device); ++ } ++ need_delete_cmd_buf[i]->is_need_hard_encode = false; ++ } ++ } ++ + return VK_SUCCESS; + } + +diff --git a/src/vulkan/runtime/vk_texcompress_bcn.c b/src/vulkan/runtime/vk_texcompress_bcn.c +new file mode 100644 +index 00000000000..2c85d98af5c +--- /dev/null ++++ b/src/vulkan/runtime/vk_texcompress_bcn.c +@@ -0,0 +1,740 @@ ++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining ++ * a copy of this software and associated documentation files (the ++ * "Software"), to deal in the Software without restriction, including ++ * without limitation the rights to use, copy, modify, merge, publish, ++ * distribute, sublicense, and/or sell copies of the Software, and to ++ * permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be ++ * included in all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
++ */ ++ ++#include "vk_texcompress_bcn.h" ++#include "util/bc1_tables.h" ++#include "vk_alloc.h" ++#include "vk_format.h" ++#include "vk_image.h" ++#include "vk_physical_device.h" ++#include "log/log.h" ++ ++void ++vk_texcompress_insert_head(staging_images **head, bcn_image *current) ++{ ++ staging_images *node = (staging_images *)malloc(sizeof(staging_images)); ++ if (node == NULL) { ++ ALOGE("Failed to alloc staging_images node"); ++ return; ++ } ++ ++ node->current = current; ++ node->next = *head; ++ *head = node; ++} ++ ++void ++vk_texcompress_delete_head(staging_images **head, struct vk_device *device) ++{ ++ if (*head == NULL) { ++ return; ++ } ++ staging_images *current_head = *head; ++ vk_texcompress_finish_image(device, NULL, current_head->current); ++ *head = current_head->next; ++ free(current_head); ++} ++ ++/* type_indexes_mask bits are set/clear for support memory type index as per ++ * struct VkPhysicalDeviceMemoryProperties.memoryTypes[] */ ++static uint32_t ++get_mem_type_index(struct vk_device *device, uint32_t type_indexes_mask, ++ VkMemoryPropertyFlags mem_property) ++{ ++ const struct vk_physical_device_dispatch_table *disp = &device->physical->dispatch_table; ++ VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical); ++ ++ VkPhysicalDeviceMemoryProperties2 props2 = { ++ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2, ++ .pNext = NULL, ++ }; ++ disp->GetPhysicalDeviceMemoryProperties2(_phy_device, &props2); ++ ++ for (uint32_t i = 0; i < props2.memoryProperties.memoryTypeCount; i++) { ++ if ((type_indexes_mask & (1 << i)) && ++ ((props2.memoryProperties.memoryTypes[i].propertyFlags & mem_property) == mem_property)) { ++ return i; ++ } ++ } ++ ++ return -1; ++} ++ ++static VkResult ++create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator, struct vk_texcompress_bcn_state *bc1) ++{ ++ VkDeviceSize size = sizeof(float) * (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)); ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkBufferCreateInfo buffer_create_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, ++ .size = size, ++ .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, ++ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, ++ }; ++ result = ++ disp->CreateBuffer(_device, &buffer_create_info, allocator, &bc1->bc1_table_buf); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ VkBufferMemoryRequirementsInfo2 mem_req_info = { ++ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2, ++ .buffer = bc1->bc1_table_buf, ++ }; ++ VkMemoryRequirements2 mem_req = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2, ++ }; ++ disp->GetBufferMemoryRequirements2(_device, &mem_req_info, &mem_req); ++ ++ uint32_t mem_type_index = get_mem_type_index( ++ device, mem_req.memoryRequirements.memoryTypeBits, ++ VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); ++ if (mem_type_index == -1) ++ return VK_ERROR_OUT_OF_DEVICE_MEMORY; ++ ++ VkMemoryAllocateInfo alloc_info = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, ++ .allocationSize = mem_req.memoryRequirements.size, ++ .memoryTypeIndex = mem_type_index, ++ }; ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, &bc1->bc1_table_mem); ++ if (unlikely(result != VK_SUCCESS)) ++ return result; ++ ++ result = disp->BindBufferMemory(_device, bc1->bc1_table_buf, bc1->bc1_table_mem, 0); ++ if (unlikely(result != VK_SUCCESS)) ++ return 
result; ++ ++ void *data; ++ disp->MapMemory(_device, bc1->bc1_table_mem, 0, size, 0, &data); ++ float (*data_array)[2] = (float(*)[2])data; ++ for (int i = 0; i < 256; i++) { ++ for (int j = 0; j < 2; j++) { ++ data_array[i][j] = (float)stb__OMatch5[i][j]; ++ data_array[i + 256][j] = (float)stb__OMatch6[i][j]; ++ } ++ } ++ disp->UnmapMemory(_device, bc1->bc1_table_mem); ++ ++ return result; ++} ++ ++VkResult ++vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem, ++ VkImage *image, VkExtent3D extent, VkFormat format, int size) ++{ ++ bool is_dummy = (size != 0); ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkImageCreateInfo image_create_info = { ++ .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, ++ .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, ++ .imageType = VK_IMAGE_TYPE_2D, ++ .format = format, ++ .extent = extent, ++ .mipLevels = 1, ++ .arrayLayers = 1, ++ .samples = VK_SAMPLE_COUNT_1_BIT, ++ .tiling = VK_IMAGE_TILING_OPTIMAL, ++ .usage = VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT, ++ .sharingMode = VK_SHARING_MODE_EXCLUSIVE, ++ .initialLayout = VK_IMAGE_LAYOUT_GENERAL, ++ }; ++ result = disp->CreateImage(_device, &image_create_info, allocator, image); ++ if (unlikely(result != VK_SUCCESS)) { ++ return result; ++ } ++ ++ if (is_dummy) { ++ VkMemoryRequirements mem_requirements; ++ disp->GetImageMemoryRequirements(_device, *image, &mem_requirements); ++ ++ uint32_t mem_type_index = get_mem_type_index( ++ device, mem_requirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); ++ if (mem_type_index == -1) { ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ ++ VkMemoryAllocateInfo alloc_info = { ++ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, ++ .allocationSize = size, ++ .memoryTypeIndex = mem_type_index, ++ }; ++ result = disp->AllocateMemory(_device, &alloc_info, allocator, pMem); ++ if (unlikely(result != VK_SUCCESS)) { ++ return result; ++ } ++ } ++ if (is_dummy) { ++ disp->DestroyImage(_device, *image, allocator); ++ } ++ return result; ++} ++ ++void ++vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_image *intermidate_image) ++{ ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_rgba, NULL); ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc1, NULL); ++ disp->DestroyImage(vk_device_to_handle(device), intermidate_image->image_bc4, NULL); ++ disp->FreeMemory(vk_device_to_handle(device), intermidate_image->image_mem, NULL); ++ vk_free(&device->alloc, intermidate_image); ++} ++ ++void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ disp->BindImageMemory(_device, *image, *pMem, offset); ++} ++ ++static VkResult ++create_sampler(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkSampler *sampler, VkFilter filter, VkSamplerMipmapMode mipmapMode) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkSamplerCreateInfo create_info = { ++ .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, ++ .magFilter = filter, ++ 
.minFilter = filter, ++ .addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT, ++ .anisotropyEnable = VK_FALSE, ++ .borderColor = VK_BORDER_COLOR_INT_OPAQUE_BLACK, ++ .unnormalizedCoordinates = VK_FALSE, ++ .compareEnable = VK_FALSE, ++ .mipmapMode = mipmapMode, ++ }; ++ VkResult result = disp->CreateSampler(_device, &create_info, allocator, sampler); ++ return result; ++} ++ ++static VkResult ++create_bc1_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 2, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_bc4_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_bc3_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkDescriptorSetLayoutBinding bindings[] = { ++ { ++ .binding = 0, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 1, ++ .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, ++ .descriptorCount = 1, ++ .stageFlags = 
VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ { ++ .binding = 2, ++ .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, ++ .descriptorCount = 1, ++ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, ++ .pImmutableSamplers = NULL, ++ }, ++ }; ++ ++ VkDescriptorSetLayoutCreateInfo ds_create_info = { ++ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, ++ .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR, ++ .bindingCount = ARRAY_SIZE(bindings), ++ .pBindings = bindings, ++ }; ++ ++ result = disp->CreateDescriptorSetLayout(_device, &ds_create_info, allocator, &bcn->ds_layout); ++ return result; ++} ++ ++static VkResult ++create_pipeline_layout(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, enum compute_pipeline_id id) ++{ ++ VkResult result = VK_RESULT_MAX_ENUM; ++ switch (id) { ++ case BC1_ENCODE: ++ result = create_bc1_layout(device, allocator, bcn); ++ break; ++ case BC4_ENCODE: ++ result = create_bc4_layout(device, allocator, bcn); ++ break; ++ case BC3_ENCODE: ++ result = create_bc3_layout(device, allocator, bcn); ++ break; ++ default: ++ goto fail; ++ } ++ if (result != VK_SUCCESS) { ++ goto fail; ++ } ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipelineLayoutCreateInfo pl_create_info = { ++ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, ++ .setLayoutCount = 1, ++ .pSetLayouts = &bcn->ds_layout, ++ .pushConstantRangeCount = 1, ++ .pPushConstantRanges = &(VkPushConstantRange) {VK_SHADER_STAGE_COMPUTE_BIT, 0, 8} ++ }; ++ result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator, &bcn->p_layout); ++fail: ++ return result; ++} ++ ++static const uint32_t bc1_spv[] = { ++#include "bc1_spv.h" ++}; ++ ++static const uint32_t bc4_spv[] = { ++#include "bc4_spv.h" ++}; ++ ++static const uint32_t bc3_spv[] = { ++#include "bc3_spv.h" ++}; ++ ++static VkResult ++vk_bc1_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc1_spv), ++ .pCode = bc1_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++vk_bc4_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc4_spv), ++ .pCode = bc4_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++vk_bc3_create_shader_module(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn) ++{ ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ ++ VkShaderModuleCreateInfo shader_module_create_info = { ++ .sType = 
VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, ++ .pNext = NULL, ++ .flags = 0, ++ .codeSize = sizeof(bc3_spv), ++ .pCode = bc3_spv, ++ }; ++ ++ return disp->CreateShaderModule(_device, &shader_module_create_info, allocator, &bcn->shader_module); ++} ++ ++static VkResult ++create_bcn_encode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, ++ VkPipelineCache pipeline_cache) ++{ ++ VkResult result; ++ VkDevice _device = vk_device_to_handle(device); ++ const struct vk_device_dispatch_table *disp = &device->dispatch_table; ++ VkPipeline pipeline; ++ ++ /* compute shader */ ++ VkPipelineShaderStageCreateInfo pipeline_shader_stage = { ++ .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, ++ .stage = VK_SHADER_STAGE_COMPUTE_BIT, ++ .module = bcn->shader_module, ++ .pName = "main", ++ }; ++ ++ VkComputePipelineCreateInfo vk_pipeline_info = { ++ .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, ++ .stage = pipeline_shader_stage, ++ .flags = 0, ++ .layout = bcn->p_layout, ++ }; ++ ++ result = disp->CreateComputePipelines(_device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline); ++ if (result != VK_SUCCESS) ++ return result; ++ ++ bcn->pipeline = pipeline; ++ ++ return result; ++} ++ ++VkPipeline ++vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, VkPipelineCache pipeline_cache, ++ enum compute_pipeline_id id) ++{ ++ VkResult result; ++ ++ simple_mtx_lock(&bcn->mutex); ++ ++ if (bcn->pipeline) ++ goto unlock; ++ ++ if (id >= MAX_PIPELINE_NUM || id < 0) ++ goto unlock; ++ ++ if (id == BC1_ENCODE && !bcn->shader_module) { ++ result = vk_bc1_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ if (id == BC4_ENCODE && !bcn->shader_module) { ++ result = vk_bc4_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ if (id == BC3_ENCODE && !bcn->shader_module) { ++ result = vk_bc3_create_shader_module(device, allocator, bcn); ++ if (result != VK_SUCCESS) ++ goto unlock; ++ } ++ ++ create_bcn_encode_pipeline(device, allocator, bcn, pipeline_cache); ++ ++unlock: ++ simple_mtx_unlock(&bcn->mutex); ++ return bcn->pipeline; ++} ++ ++static inline void ++fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkSampler sampler, VkImageView img_view, ++ VkImageLayout img_layout) ++{ ++ info->sampler = sampler; ++ info->imageView = img_view; ++ info->imageLayout = img_layout; ++} ++ ++static inline void ++fill_write_descriptor_set_image(VkWriteDescriptorSet *set, uint8_t bind_i, ++ VkDescriptorType desc_type, VkDescriptorImageInfo *image_info) ++{ ++ set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; ++ set->pNext = NULL; ++ set->dstSet = VK_NULL_HANDLE; ++ set->dstBinding = bind_i; ++ set->dstArrayElement = 0; ++ set->descriptorCount = 1; ++ set->descriptorType = desc_type; ++ set->pImageInfo = image_info; ++ set->pBufferInfo = NULL; ++ set->pTexelBufferView = NULL; ++} ++ ++static inline void ++fill_write_descriptor_set_buffer(VkWriteDescriptorSet *set, ++ uint8_t bind_i, VkDescriptorType desc_type, ++ VkDescriptorBufferInfo *buf_info) ++{ ++ set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; ++ set->pNext = NULL; ++ set->dstSet = VK_NULL_HANDLE; ++ set->dstBinding = bind_i; ++ set->dstArrayElement = 0; ++ set->descriptorCount = 1; ++ set->descriptorType = desc_type; ++ set->pImageInfo = NULL; ++ set->pBufferInfo = buf_info; ++ 
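++   /* pTexelBufferView is only read for texel-buffer descriptor types;
++    * it is cleared here so the struct is fully initialized. */
++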
set->pTexelBufferView = NULL; ++} ++ ++void ++vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bcn_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ set->bc1_buffer_info.buffer = bcn->bc1_table_buf; ++ set->bc1_buffer_info.offset = 0; ++ set->bc1_buffer_info.range = VK_WHOLE_SIZE; ++ fill_write_descriptor_set_buffer(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, &set->bc1_buffer_info); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++void ++vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bc4_write_descriptor_set *set, ++ VkImageView src_img_view, VkImageLayout src_img_layout, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, src_img_view, src_img_layout); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++void ++vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn, ++ struct vk_texcompress_bcn_write_descriptor_set *set, ++ VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, ++ VkImageView dst_img_view) ++{ ++ fill_desc_image_info_struct(&set->src_desc_image_info, bcn->sampler, bc1_src_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[0], 0, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info); ++ ++ fill_desc_image_info_struct(&set->src_desc_image_info_2, bcn->sampler, bc4_src_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[1], 1, ++ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, &set->src_desc_image_info_2); ++ ++ fill_desc_image_info_struct(&set->dst_desc_image_info, VK_NULL_HANDLE, dst_img_view, VK_IMAGE_LAYOUT_GENERAL); ++ fill_write_descriptor_set_image(&set->descriptor_set[2], 2, ++ VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info); ++} ++ ++VkResult ++vk_texcompress_bcn_init(struct vk_device *device, VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_bcn_state *bcn[]) ++{ ++ VkResult result; ++ ++ for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) { ++ /* memory to be freed as part of vk_bcn_decode_finish() */ ++ bcn[pi] = vk_zalloc(allocator, sizeof(struct vk_texcompress_bcn_state), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); ++ if (bcn[pi] == NULL) { ++ return VK_ERROR_OUT_OF_HOST_MEMORY; ++ } ++ simple_mtx_init(&(bcn[pi])->mutex, mtx_plain); ++ } ++ ++ result = create_buffer(device, allocator, bcn[BC1_ENCODE]); ++ if (result != VK_SUCCESS) ++ goto fail; ++ ++ result = create_sampler(device, allocator, &bcn[BC1_ENCODE]->sampler, ++ 
VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC1_ENCODE], BC1_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_sampler(device, allocator, &bcn[BC4_ENCODE]->sampler,
++                           VK_FILTER_LINEAR, VK_SAMPLER_MIPMAP_MODE_LINEAR);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC4_ENCODE], BC4_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   // VK_FORMAT_R32G32_UINT image views do not expose VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT, so use nearest filtering
++   result = create_sampler(device, allocator, &bcn[BC3_ENCODE]->sampler,
++                           VK_FILTER_NEAREST, VK_SAMPLER_MIPMAP_MODE_NEAREST);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++   result = create_pipeline_layout(device, allocator, bcn[BC3_ENCODE], BC3_ENCODE);
++   if (result != VK_SUCCESS)
++      goto fail;
++
++fail:
++   return result;
++}
++
++void
++vk_texcompress_bcn_finish(struct vk_device *device,
++                          VkAllocationCallbacks *allocator,
++                          struct vk_texcompress_bcn_state *bcn[])
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   for (int pi = VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES - 1; pi >= 0; --pi) {
++      if (bcn[pi]->pipeline)
++         disp->DestroyPipeline(_device, bcn[pi]->pipeline, allocator);
++
++      disp->DestroyPipelineLayout(_device, bcn[pi]->p_layout, allocator);
++      disp->DestroyShaderModule(_device, bcn[pi]->shader_module, allocator);
++      disp->DestroyDescriptorSetLayout(_device, bcn[pi]->ds_layout, allocator);
++
++      if (pi == BC1_ENCODE) {
++         disp->DestroyBuffer(_device, bcn[pi]->bc1_table_buf, allocator);
++         disp->FreeMemory(_device, bcn[pi]->bc1_table_mem, allocator);
++      }
++      vk_free(allocator, bcn[pi]);
++   }
++}
++
++void
++vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence)
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   VkFenceCreateInfo create_info = {
++      .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
++      .flags = VK_FENCE_CREATE_SIGNALED_BIT,
++   };
++   VkResult result = disp->CreateFence(_device, &create_info, allocator, fence);
++   if (result != VK_SUCCESS) {
++      ALOGE("Failed to create fence");
++   }
++}
++
++void
++vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence)
++{
++   VkDevice _device = vk_device_to_handle(device);
++   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
++
++   VkResult result = disp->WaitForFences(_device, 1, fence, VK_TRUE, UINT64_MAX);
++   if (result != VK_SUCCESS) {
++      ALOGE("Failed to wait for fence");
++   }
++}
+\ No newline at end of file
+diff --git a/src/vulkan/runtime/vk_texcompress_bcn.h b/src/vulkan/runtime/vk_texcompress_bcn.h
+new file mode 100644
+index 00000000000..ca8189b657e
+--- /dev/null
++++ b/src/vulkan/runtime/vk_texcompress_bcn.h
+@@ -0,0 +1,142 @@
++/* Copyright (c) 2017-2023 Hans-Kristian Arntzen
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining
++ * a copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sublicense, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ *
the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be
++ * included in all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
++ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
++ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
++ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++#ifndef VK_TEXCOMPRESS_BCN_H
++#define VK_TEXCOMPRESS_BCN_H
++
++#include "util/simple_mtx.h"
++#include "vk_fence.h"
++#include "vk_device.h"
++
++#define VK_TEXCOMPRESS_BCN_NUM_COMPUTE_PIPELINES 3
++#define VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT 3 // at most 3 descriptor writes per shader
++
++struct vk_texcompress_bcn_write_descriptor_set {
++   VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_BCN_WRITE_DESC_SET_COUNT];
++   VkDescriptorImageInfo dst_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info_2;
++   VkDescriptorBufferInfo bc1_buffer_info;
++};
++
++struct vk_texcompress_bc4_write_descriptor_set {
++   VkWriteDescriptorSet descriptor_set[2];
++   VkDescriptorImageInfo dst_desc_image_info;
++   VkDescriptorImageInfo src_desc_image_info;
++};
++
++typedef struct vk_texcompress_bcn_image {
++   VkImage image_dummy;
++   VkImage image_rgba;
++   VkImage image_bc1;
++   VkImage image_bc4;
++   VkDeviceMemory image_mem;
++} bcn_image;
++
++struct vk_texcompress_dummy_image {
++   VkImage image_dummy;
++   VkDeviceMemory image_mem;
++};
++
++typedef struct vk_texcompress_staging_images {
++   struct vk_texcompress_bcn_image *current;
++   struct vk_texcompress_staging_images* next;
++} staging_images;
++
++enum compute_pipeline_id {
++   BC1_ENCODE = 0,
++   BC4_ENCODE,
++   BC3_ENCODE,
++   MAX_PIPELINE_NUM
++};
++
++struct vk_texcompress_bcn_state {
++   simple_mtx_t mutex;
++   VkDescriptorSetLayout ds_layout;
++   VkPipelineLayout p_layout;
++   VkPipeline pipeline;
++   VkShaderModule shader_module;
++   VkSampler sampler;
++
++   VkDeviceMemory bc1_table_mem;
++   VkBuffer bc1_table_buf;
++};
++
++void
++vk_texcompress_insert_head(staging_images **head, bcn_image *current);
++
++void
++vk_texcompress_delete_head(staging_images **head, struct vk_device *device);
++
++VkResult
++vk_texcompress_create_image(struct vk_device *device, VkAllocationCallbacks *allocator, VkDeviceMemory *pMem,
++                            VkImage *image, VkExtent3D extent, VkFormat format, int size);
++
++void bind_memory(struct vk_device *device, VkImage *image, VkDeviceMemory *pMem, int offset);
++
++void
++vk_texcompress_finish_image(struct vk_device *device, const VkAllocationCallbacks *allocator,
++                            struct vk_texcompress_bcn_image *intermidate_image);
++
++void
++vk_texcompress_bc1_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bcn_write_descriptor_set *set,
++                                              VkImageView src_img_view, VkImageLayout src_img_layout,
++                                              VkImageView dst_img_view);
++
++void
++vk_texcompress_bc4_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bc4_write_descriptor_set *set,
++                                              VkImageView src_img_view, VkImageLayout src_img_layout,
++                                              VkImageView dst_img_view);
++
++void
++vk_texcompress_bc3_fill_write_descriptor_sets(struct vk_texcompress_bcn_state *bcn,
++                                              struct vk_texcompress_bcn_write_descriptor_set *set,
++
VkImageView bc1_src_img_view, VkImageView bc4_src_img_view, ++ VkImageView dst_img_view); ++ ++VkPipeline ++vk_texcompress_rgba_get_encode_pipeline(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn, ++ VkPipelineCache pipeline_cache, ++ enum compute_pipeline_id id); ++ ++VkResult ++vk_texcompress_bcn_init(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ VkPipelineCache pipeline_cache, ++ struct vk_texcompress_bcn_state *bcn[]); ++ ++void ++vk_texcompress_bcn_finish(struct vk_device *device, ++ VkAllocationCallbacks *allocator, ++ struct vk_texcompress_bcn_state *bcn[]); ++ ++ ++void ++vk_texcompress_create_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); ++ ++void ++vk_texcompress_wait_fence(struct vk_device *device, VkAllocationCallbacks *allocator, VkFence *fence); ++ ++#endif /* VK_TEXCOMPRESS_BCN_H */ -- Gitee From 646e99d044d2e307bd583369a8d3c462b3e16635 Mon Sep 17 00:00:00 2001 From: zhuanghb3 Date: Thu, 25 Sep 2025 02:18:55 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E4=BD=BF=E8=83=BDvulkan=20astc=E5=92=8Cetc?= =?UTF-8?q?=E7=BA=B9=E7=90=86=E7=A1=AC=E8=A7=A3=E7=A1=AC=E7=BC=96=E4=B8=BA?= =?UTF-8?q?bc3=E7=BA=B9=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- patchForAndroid/external-mesa-0003.patch | 1579 ++++++++++++++++++++++ 1 file changed, 1579 insertions(+) create mode 100644 patchForAndroid/external-mesa-0003.patch diff --git a/patchForAndroid/external-mesa-0003.patch b/patchForAndroid/external-mesa-0003.patch new file mode 100644 index 0000000..1b0df66 --- /dev/null +++ b/patchForAndroid/external-mesa-0003.patch @@ -0,0 +1,1579 @@ +diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c +index 7172e41ab75..522da8c8699 100644 +--- a/src/mesa/main/shaderapi.c ++++ b/src/mesa/main/shaderapi.c +@@ -2674,12 +2674,11 @@ _mesa_copy_linked_program_data(const struct gl_shader_program *src, + /** + * ARB_separate_shader_objects: Compile & Link Program + */ +-GLuint GLAPIENTRY +-_mesa_CreateShaderProgramv(GLenum type, GLsizei count, +- const GLchar* const *strings) ++GLuint ++_mesa_CreateShaderProgramv_impl(struct gl_context *ctx, ++ GLenum type, GLsizei count, ++ const GLchar* const *strings) + { +- GET_CURRENT_CONTEXT(ctx); +- + const GLuint shader = create_shader_err(ctx, type, "glCreateShaderProgramv"); + GLuint program = 0; + +@@ -2731,6 +2730,17 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count, + return program; + } + ++/** ++ * ARB_separate_shader_objects: Compile & Link Program ++ */ ++GLuint GLAPIENTRY ++_mesa_CreateShaderProgramv(GLenum type, GLsizei count, ++ const GLchar* const *strings) ++{ ++ GET_CURRENT_CONTEXT(ctx); ++ ++ return _mesa_CreateShaderProgramv_impl(ctx, type, count, strings); ++} + + static void + set_patch_vertices(struct gl_context *ctx, GLint value) +diff --git a/src/mesa/main/shaderapi.h b/src/mesa/main/shaderapi.h +index 90e90fee743..1367dd8b5ee 100644 +--- a/src/mesa/main/shaderapi.h ++++ b/src/mesa/main/shaderapi.h +@@ -200,6 +200,10 @@ const char * + _mesa_lookup_shader_include(struct gl_context *ctx, char *path, + bool error_check); + ++GLuint ++_mesa_CreateShaderProgramv_impl(struct gl_context *ctx, ++ GLenum type, GLsizei count, ++ const GLchar* const *strings); + #ifdef __cplusplus + } + #endif +diff --git a/src/mesa/meson.build b/src/mesa/meson.build +index bc7963413ff..06cba8d8353 100644 +--- a/src/mesa/meson.build ++++ b/src/mesa/meson.build +@@ -320,6 +320,7 @@ files_libmesa 
= files( + 'state_tracker/st_atom_tess.c', + 'state_tracker/st_atom_texture.c', + 'state_tracker/st_atom_viewport.c', ++ 'state_tracker/st_bc1_tables.h', + 'state_tracker/st_cb_bitmap.c', + 'state_tracker/st_cb_bitmap.h', + 'state_tracker/st_cb_bitmap_shader.c', +@@ -388,6 +389,8 @@ files_libmesa = files( + 'state_tracker/st_scissor.h', + 'state_tracker/st_shader_cache.c', + 'state_tracker/st_shader_cache.h', ++ 'state_tracker/st_texcompress_compute.c', ++ 'state_tracker/st_texcompress_compute.h', + 'state_tracker/st_texture.c', + 'state_tracker/st_texture.h', + 'state_tracker/st_tgsi_lower_yuv.c', +diff --git a/src/mesa/state_tracker/st_bc1_tables.h b/src/mesa/state_tracker/st_bc1_tables.h +new file mode 100644 +index 00000000000..654ccdadb78 +--- /dev/null ++++ b/src/mesa/state_tracker/st_bc1_tables.h +@@ -0,0 +1,124 @@ ++/* ++ * Copyright 2020-2022 Matias N. Goldberg ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER ++ * DEALINGS IN THE SOFTWARE. 
++ */
++
++/* These tables have been generated with:
++
++   #define STB_DXT_IMPLEMENTATION
++   #include "stb_dxt.h"
++   #include <stdio.h>
++
++   int main()
++   {
++      stb__InitDXT();
++
++      printf( "static unsigned char stb__OMatch5[256][2] = {\n" );
++      for( size_t i = 0u; i < 256u; ++i )
++      {
++         printf( "{ %i, %i }", stb__OMatch5[i][0], stb__OMatch5[i][1] );
++         if( i != 255u )
++            printf( ",\n" );
++      }
++      printf( "\n};\n" );
++
++      printf( "static unsigned char stb__OMatch6[256][2] = {\n" );
++      for( size_t i = 0u; i < 256u; ++i )
++      {
++         printf( "{ %i, %i }", stb__OMatch6[i][0], stb__OMatch6[i][1] );
++         if( i != 255u )
++            printf( ",\n" );
++      }
++      printf( "\n};\n" );
++
++      return 0;
++   }
++
++   Note that stb__OMatch5[i][0] = max and stb__OMatch5[i][1] = min
++*/
++static unsigned char stb__OMatch5[256][2] = {
++   { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 },
++   { 1, 1 }, { 2, 0 }, { 2, 0 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 3, 0 },
++   { 3, 0 }, { 3, 0 }, { 3, 1 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 4, 0 },
++   { 4, 1 }, { 4, 1 }, { 4, 2 }, { 4, 2 }, { 4, 2 }, { 3, 5 }, { 5, 1 }, { 5, 1 },
++   { 5, 2 }, { 4, 4 }, { 5, 3 }, { 5, 3 }, { 5, 3 }, { 6, 2 }, { 6, 2 }, { 6, 2 },
++   { 6, 3 }, { 5, 5 }, { 6, 4 }, { 6, 4 }, { 4, 8 }, { 7, 3 }, { 7, 3 }, { 7, 3 },
++   { 7, 4 }, { 7, 4 }, { 7, 4 }, { 7, 5 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 },
++   { 8, 4 }, { 8, 5 }, { 8, 5 }, { 8, 6 }, { 8, 6 }, { 8, 6 }, { 7, 9 }, { 9, 5 },
++   { 9, 5 }, { 9, 6 }, { 8, 8 }, { 9, 7 }, { 9, 7 }, { 9, 7 }, { 10, 6 }, { 10, 6 },
++   { 10, 6 }, { 10, 7 }, { 9, 9 }, { 10, 8 }, { 10, 8 }, { 8, 12 }, { 11, 7 }, { 11, 7 },
++   { 11, 7 }, { 11, 8 }, { 11, 8 }, { 11, 8 }, { 11, 9 }, { 9, 13 }, { 11, 10 }, { 11, 10 },
++   { 12, 8 }, { 12, 8 }, { 12, 9 }, { 12, 9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 },
++   { 13, 9 }, { 13, 9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 },
++   { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 },
++   { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 },
++   { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 },
++   { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 },
++   { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 },
++   { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 },
++   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 },
++   { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 },
++   { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 },
++   { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 },
++   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 },
++   { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 },
++   { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 },
++   { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 },
++   { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 },
++   { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 },
++   { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 },
++   { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 },
++   { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 }
++};
++
++static unsigned char stb__OMatch6[256][2] = {
++   { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, { 2, 0 }, { 2, 1 }, { 3, 0 },
++   { 3, 0 }, { 3, 1 }, { 4, 0 }, { 4, 0 }, { 4, 1 }, { 5, 0 }, { 5, 1 }, { 6, 0 },
++   { 6, 0 }, { 6, 1 }, { 7, 0 }, { 7, 0 }, { 7, 1 }, { 8, 0 }, { 8, 1 }, { 8, 1 },
++   { 8, 2 }, { 9, 1 }, { 9, 2 }, { 9, 2 }, { 9, 3 }, { 10, 2 }, { 10, 3 }, { 10, 3 },
++   { 10, 4 }, { 11, 3 }, { 11, 4 }, { 11, 4 }, { 11, 5 }, { 12, 4 }, { 12, 5 }, { 12, 5 },
++   { 12, 6 }, { 13, 5 }, { 13, 6 }, { 8, 16 }, { 13, 7 }, { 14, 6 }, { 14, 7 }, { 9, 17 },
++   { 14, 8 }, { 15, 7 }, { 15, 8 }, { 11, 16 }, { 15, 9 }, { 15, 10 }, { 16, 8 }, { 16, 9 },
++   { 16, 10 }, { 15, 13 }, { 17, 9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 },
++   { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 },
++   { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 },
++   { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 },
++   { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 },
++   { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 },
++   { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 },
++   { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 },
++   { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 },
++   { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 },
++   { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 },
++   { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 },
++   { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 },
++   { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 },
++   { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 },
++   { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 },
++   { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 },
++   { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 },
++   { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 },
++   { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 },
++   { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 },
++   { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 },
++   { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 },
++   { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 },
++   { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 }
++};
+diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
+index f4609e73cf4..bd81d022e6a 100644
+--- a/src/mesa/state_tracker/st_cb_texture.c
++++ b/src/mesa/state_tracker/st_cb_texture.c
+@@ -27,6 +27,7 @@
+ 
+ #include <stdio.h>
+ #include "main/bufferobj.h"
++#include "main/context.h"
+ #include "main/enums.h"
+ #include "main/errors.h"
+ #include "main/fbobject.h"
+@@ -62,10 +63,12 @@
+ #include "state_tracker/st_gen_mipmap.h"
+ #include "state_tracker/st_atom.h"
+ #include "state_tracker/st_sampler_view.h"
++#include "state_tracker/st_texcompress_compute.h"
+ #include "state_tracker/st_util.h"
+ 
+ #include "pipe/p_context.h"
+ #include "pipe/p_defines.h"
++#include "util/log.h"
+ #include "util/u_inlines.h"
+ #include "util/u_upload_mgr.h"
+ #include "pipe/p_shader_tokens.h"
+@@ -479,8 +482,6 @@ st_MapTextureImage(struct gl_context *ctx,
+                    GLubyte **mapOut, GLint *rowStrideOut)
+ {
+    struct st_context *st = st_context(ctx);
+-   GLubyte *map;
+-   struct pipe_transfer *transfer;
+ 
+    /* Check for unexpected flags */
+    assert((mode & ~(GL_MAP_READ_BIT |
+@@ -490,47 +491,59 @@ st_MapTextureImage(struct gl_context *ctx,
+    const enum pipe_map_flags transfer_flags =
+       _mesa_access_flags_to_transfer_flags(mode, false);
+ 
+-   map = st_texture_image_map(st, texImage, transfer_flags, x, y, slice, w, h, 1,
+-                              &transfer);
+-   if (map) {
+-      if (st_compressed_format_fallback(st, texImage->TexFormat)) {
+-         /* Some compressed formats don't have to be supported by drivers,
+-          * and st/mesa transparently handles decompression on upload (Unmap),
+-          * so that drivers don't see the compressed formats.
+-          *
+-          * We store the compressed data (it's needed for glGetCompressedTex-
+-          * Image and image copies in OES_copy_image).
+-          */
+-         unsigned z = transfer->box.z;
+-         struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
+-
+-         unsigned blk_w, blk_h;
+-         _mesa_get_format_block_size(texImage->TexFormat, &blk_w, &blk_h);
+-
+-         unsigned y_blocks = DIV_ROUND_UP(texImage->Height2, blk_h);
+-         unsigned stride = *rowStrideOut = itransfer->temp_stride =
+-            _mesa_format_row_stride(texImage->TexFormat, texImage->Width2);
+-         unsigned block_size = _mesa_get_format_bytes(texImage->TexFormat);
+-
+-         assert(texImage->compressed_data);
+-         *mapOut = itransfer->temp_data =
+-            texImage->compressed_data->ptr +
+-            (z * y_blocks + (y / blk_h)) * stride +
+-            (x / blk_w) * block_size;
+-         itransfer->map = map;
+-      }
+-      else {
+-         /* supported mapping */
+-         *mapOut = map;
+-         *rowStrideOut = transfer->stride;
+-      }
+-   }
+-   else {
+-      *mapOut = NULL;
+-      *rowStrideOut = 0;
++   if (st_compressed_format_fallback(st, texImage->TexFormat)) {
++      /* Some compressed formats don't have to be supported by drivers,
++       * and st/mesa transparently handles decompression on upload (Unmap),
++       * so that drivers don't see the compressed formats.
++       *
++       * We store the compressed data (it's needed for glGetCompressedTexImage
++       * and image copies in OES_copy_image).
++       */
++      unsigned z = slice + texImage->Face +
++                   texImage->TexObject->Attrib.MinLayer;
++
++      /* Enlarge the transfer array if it's not large enough. */
++      st_texture_image_insert_transfer(texImage, z, NULL);
++
++      struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
++
++      assert(itransfer->box.depth == 0);
++      if (transfer_flags & PIPE_MAP_WRITE)
++         u_box_2d_zslice(x, y, z, w, h, &itransfer->box);
++
++      unsigned blk_w, blk_h;
++      _mesa_get_format_block_size(texImage->TexFormat, &blk_w, &blk_h);
++
++      unsigned y_blocks = DIV_ROUND_UP(texImage->Height2, blk_h);
++      unsigned stride = *rowStrideOut = itransfer->temp_stride =
++         _mesa_format_row_stride(texImage->TexFormat, texImage->Width2);
++      unsigned block_size = _mesa_get_format_bytes(texImage->TexFormat);
++
++      assert(texImage->compressed_data);
++      *mapOut = itransfer->temp_data =
++         texImage->compressed_data->ptr +
++         (z * y_blocks + (y / blk_h)) * stride +
++         (x / blk_w) * block_size;
++   } else {
++      struct pipe_transfer *transfer;
++      *mapOut = st_texture_image_map(st, texImage, transfer_flags,
++                                     x, y, slice, w, h, 1, &transfer);
++      *rowStrideOut = *mapOut ? transfer->stride : 0;
+    }
+ }
+ 
++static void
++log_unmap_time_delta(const struct pipe_box *box,
++                     const struct gl_texture_image *texImage,
++                     const char *pathname, int64_t start_us)
++{
++   assert(start_us >= 0);
++   mesa_logi("unmap %dx%d pixels of %s data for %s tex, %s path: "
++             "%"PRIi64" us\n", box->width, box->height,
++             util_format_short_name(texImage->TexFormat),
++             util_format_short_name(texImage->pt->format),
++             pathname, os_time_get() - start_us);
++}
+ 
+ void
+ st_UnmapTextureImage(struct gl_context *ctx,
+@@ -544,11 +557,64 @@ st_UnmapTextureImage(struct gl_context *ctx,
+     * support the compressed format. */
+    unsigned z = slice + texImage->Face;
+    struct st_texture_image_transfer *itransfer = &texImage->transfer[z];
+-   struct pipe_transfer *transfer = itransfer->transfer;
+ 
+-   assert(z == transfer->box.z);
++   if (itransfer->box.depth != 0) {
++      assert(itransfer->box.depth == 1);
++
++      /* Toggle logging for the different unmap paths. */
++      const bool log_unmap_time = false;
++      const int64_t unmap_start_us = log_unmap_time ? os_time_get() : 0;
++      if (_mesa_is_format_astc_2d(texImage->TexFormat) &&
++          util_format_is_compressed(texImage->pt->format)) {
++
++         /* DXT5 is the only supported transcode target from ASTC. */
++         assert(texImage->pt->format == PIPE_FORMAT_DXT5_RGBA ||
++                texImage->pt->format == PIPE_FORMAT_DXT5_SRGBA);
++
++         /* Try a compute-based transcode. */
++         if (itransfer->box.x == 0 &&
++             itransfer->box.y == 0 &&
++             itransfer->box.width == texImage->Width &&
++             itransfer->box.height == texImage->Height &&
++             _mesa_has_compute_shaders(ctx) &&
++             st_compute_transcode_astc_to_dxt5(st,
++                itransfer->temp_data,
++                itransfer->temp_stride,
++                texImage->TexFormat,
++                texImage->pt,
++                st_texture_image_resource_level(texImage),
++                itransfer->box.z)) {
++
++            if (log_unmap_time) {
++               log_unmap_time_delta(&itransfer->box, texImage, "GPU",
++                                    unmap_start_us);
++            }
++
++            /* Mark the unmap as complete. */
++            assert(itransfer->transfer == NULL);
++            memset(itransfer, 0, sizeof(struct st_texture_image_transfer));
++
++            return;
++         }
++      }
++
++      struct pipe_transfer *transfer;
++      GLubyte *map = st_texture_image_map(st, texImage,
++                                          PIPE_MAP_WRITE |
++                                          PIPE_MAP_DISCARD_RANGE,
++                                          itransfer->box.x,
++                                          itransfer->box.y, slice,
++                                          itransfer->box.width,
++                                          itransfer->box.height, 1,
++                                          &transfer);
++
++      if (!map) {
++         _mesa_error(ctx, GL_OUT_OF_MEMORY, "compressed fallback map");
++         return;
++      }
++
++      assert(z == transfer->box.z);
+ 
+-   if (transfer->usage & PIPE_MAP_WRITE) {
+       if (util_format_is_compressed(texImage->pt->format)) {
+          /* Transcode into a different compressed format. */
+          unsigned size =
+@@ -590,7 +656,7 @@ st_UnmapTextureImage(struct gl_context *ctx,
+          pack.Alignment = 4;
+ 
+          _mesa_texstore(ctx, 2, GL_RGBA, texImage->pt->format,
+-                        transfer->stride, &itransfer->map,
++                        transfer->stride, &map,
+                         transfer->box.width,
+                         transfer->box.height, 1, GL_RGBA,
+                         GL_UNSIGNED_BYTE, tmp, &pack);
+@@ -598,7 +664,7 @@ st_UnmapTextureImage(struct gl_context *ctx,
+       } else {
+          /* Decompress into an uncompressed format. */
+          if (texImage->TexFormat == MESA_FORMAT_ETC1_RGB8) {
+-            _mesa_etc1_unpack_rgba8888(itransfer->map, transfer->stride,
++            _mesa_etc1_unpack_rgba8888(map, transfer->stride,
+                                        itransfer->temp_data,
+                                        itransfer->temp_stride,
+                                        transfer->box.width,
+@@ -606,14 +672,14 @@ st_UnmapTextureImage(struct gl_context *ctx,
+          } else if (_mesa_is_format_etc2(texImage->TexFormat)) {
+             bool bgra = texImage->pt->format == PIPE_FORMAT_B8G8R8A8_SRGB;
+ 
+-            _mesa_unpack_etc2_format(itransfer->map, transfer->stride,
++            _mesa_unpack_etc2_format(map, transfer->stride,
+                                      itransfer->temp_data,
+                                      itransfer->temp_stride,
+                                      transfer->box.width, transfer->box.height,
+                                      texImage->TexFormat,
+                                      bgra);
+          } else if (_mesa_is_format_astc_2d(texImage->TexFormat)) {
+-            _mesa_unpack_astc_2d_ldr(itransfer->map, transfer->stride,
++            _mesa_unpack_astc_2d_ldr(map, transfer->stride,
+                                      itransfer->temp_data,
+                                      itransfer->temp_stride,
+                                      transfer->box.width, transfer->box.height,
+@@ -622,14 +688,22 @@ st_UnmapTextureImage(struct gl_context *ctx,
+             unreachable("unexpected format for a compressed format fallback");
+          }
+       }
++
++      st_texture_image_unmap(st, texImage, slice);
++
++      if (log_unmap_time) {
++         log_unmap_time_delta(&itransfer->box, texImage, "CPU",
++                              unmap_start_us);
++      }
++
++      memset(&itransfer->box, 0, sizeof(struct pipe_box));
+    }
+ 
+    itransfer->temp_data = NULL;
+    itransfer->temp_stride = 0;
+-   itransfer->map = NULL;
++   } else {
++      st_texture_image_unmap(st, texImage, slice);
+    }
+-
+-   st_texture_image_unmap(st, texImage, slice);
+ }
+ 
+ 
+@@ -1964,6 +2038,11 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
+       goto fallback;
+    }
+ 
++   /* We need both the compressed and non-compressed textures updated,
++    * which neither the PBO nor the memcpy code-path does. */
++   if (st_compressed_format_fallback(st, texImage->TexFormat)) {
++      goto fallback;
++   }
+ 
+    /* See if the destination format is supported. */
+    if (format == GL_DEPTH_COMPONENT || format == GL_DEPTH_STENCIL)
+diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
+index 0c6b630a2e4..bb08b66491c 100644
+--- a/src/mesa/state_tracker/st_context.c
++++ b/src/mesa/state_tracker/st_context.c
+@@ -56,6 +56,7 @@
+ #include "st_program.h"
+ #include "st_sampler_view.h"
+ #include "st_shader_cache.h"
++#include "st_texcompress_compute.h"
+ #include "st_texture.h"
+ #include "st_util.h"
+ #include "pipe/p_context.h"
+@@ -68,6 +69,8 @@
+ #include "compiler/glsl/glsl_parser_extras.h"
+ #include "nir/nir_to_tgsi.h"
+ 
++#include <cutils/properties.h>
++
+ DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
+ 
+ static uint64_t
+@@ -380,6 +383,10 @@ st_destroy_context_priv(struct st_context *st, bool destroy_pipe)
+    st_destroy_drawpix(st);
+    st_destroy_drawtex(st);
+    st_destroy_pbo_helpers(st);
++
++   if (st->transcode_astc)
++      st_destroy_texcompress_compute(st);
++
+    st_destroy_bound_texture_handles(st);
+    st_destroy_bound_image_handles(st);
+ 
+@@ -472,6 +479,7 @@ st_have_perfquery(struct st_context *ctx)
+       pipe->get_intel_perf_query_data;
+ }
+ 
++static char isEnableHardEncode[PROPERTY_VALUE_MAX];
+ static struct st_context *
+ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+                        const struct st_config_options *options)
+@@ -571,6 +579,8 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+                                 PIPE_TEXTURE_2D, 0, 0,
+                                 PIPE_BIND_SAMPLER_VIEW);
+    st->transcode_astc = options->transcode_astc &&
++                        property_get("sys.vmi.gl.texturecompress", isEnableHardEncode, "0") &&
++                        strcmp(isEnableHardEncode, "1") == 0 &&
+                         screen->is_format_supported(screen, PIPE_FORMAT_DXT5_SRGBA,
+                                                     PIPE_TEXTURE_2D, 0, 0,
+                                                     PIPE_BIND_SAMPLER_VIEW) &&
+@@ -776,6 +786,17 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
+       return NULL;
+    }
+ 
++   if (_mesa_has_compute_shaders(ctx) &&
++       st->transcode_astc && !st_init_texcompress_compute(st)) {
++      /* Transcoding ASTC to DXT5 using compute shaders can provide a
++       * significant performance benefit over the CPU path. It isn't strictly
++       * necessary to fail if we can't use the compute shader path, but it's
++       * very convenient to do so. This should be rare.
++       */
++      st_destroy_context_priv(st, false);
++      return NULL;
++   }
++
+    /* This must be done after extensions are initialized to enable persistent
+     * mappings immediately.
+     */
+diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
+index 5f422769615..d83c614a91a 100644
+--- a/src/mesa/state_tracker/st_context.h
++++ b/src/mesa/state_tracker/st_context.h
+@@ -335,6 +335,13 @@ struct st_context
+       bool use_gs;
+    } pbo;
+ 
++   struct {
++      struct gl_program **progs;
++      struct pipe_resource *bc1_endpoint_buf;
++      struct pipe_sampler_view *astc_luts[5];
++      struct hash_table *astc_partition_tables;
++   } texcompress_compute;
++
+    /** for drawing with st_util_vertex */
+    struct cso_velems_state util_velems;
+ 
+diff --git a/src/mesa/state_tracker/st_texcompress_compute.c b/src/mesa/state_tracker/st_texcompress_compute.c
+new file mode 100644
+index 00000000000..118efc55d7c
+--- /dev/null
++++ b/src/mesa/state_tracker/st_texcompress_compute.c
+@@ -0,0 +1,846 @@
++/**************************************************************************
++ *
++ * Copyright © 2022 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ **************************************************************************/
++
++#include "compiler/glsl/astc_glsl.h"
++#include "compiler/glsl/bc1_glsl.h"
++#include "compiler/glsl/bc4_glsl.h"
++#include "compiler/glsl/cross_platform_settings_piece_all.h"
++#include "compiler/glsl/etc2_rgba_stitch_glsl.h"
++
++#include "main/context.h"
++#include "main/shaderapi.h"
++#include "main/shaderobj.h"
++#include "main/texcompress_astc.h"
++#include "util/texcompress_astc_luts_wrap.h"
++#include "main/uniforms.h"
++
++#include "state_tracker/st_atom_constbuf.h"
++#include "state_tracker/st_bc1_tables.h"
++#include "state_tracker/st_context.h"
++#include "state_tracker/st_program.h"
++#include "state_tracker/st_texcompress_compute.h"
++#include "state_tracker/st_texture.h"
++
++#include "util/u_hash_table.h"
++#include "util/u_string.h"
++
++enum compute_program_id {
++   COMPUTE_PROGRAM_BC1,
++   COMPUTE_PROGRAM_BC4,
++   COMPUTE_PROGRAM_STITCH,
++   COMPUTE_PROGRAM_ASTC_4x4,
++   COMPUTE_PROGRAM_ASTC_5x4,
++   COMPUTE_PROGRAM_ASTC_5x5,
++   COMPUTE_PROGRAM_ASTC_6x5,
++   COMPUTE_PROGRAM_ASTC_6x6,
++   COMPUTE_PROGRAM_ASTC_8x5,
++   COMPUTE_PROGRAM_ASTC_8x6,
++   COMPUTE_PROGRAM_ASTC_8x8,
++   COMPUTE_PROGRAM_ASTC_10x5,
++   COMPUTE_PROGRAM_ASTC_10x6,
++   COMPUTE_PROGRAM_ASTC_10x8,
++   COMPUTE_PROGRAM_ASTC_10x10,
++   COMPUTE_PROGRAM_ASTC_12x10,
++   COMPUTE_PROGRAM_ASTC_12x12,
++   COMPUTE_PROGRAM_COUNT
++};
++
++static struct gl_program * PRINTFLIKE(3, 4)
++get_compute_program(struct st_context *st,
++                    enum compute_program_id prog_id,
++                    const char *source_fmt, ...)
++{
++   /* Try to get the program from the cache. */
++   assert(prog_id < COMPUTE_PROGRAM_COUNT);
++   if (st->texcompress_compute.progs[prog_id])
++      return st->texcompress_compute.progs[prog_id];
++
++   /* Cache miss. Create the final source string. */
++   char *source_str;
++   va_list ap;
++   va_start(ap, source_fmt);
++   int num_printed_bytes = vasprintf(&source_str, source_fmt, ap);
++   va_end(ap);
++   if (num_printed_bytes == -1)
++      return NULL;
++
++   /* Compile and link the shader. Then, destroy the shader string. */
++   const char *strings[] = { source_str };
++   GLuint program =
++      _mesa_CreateShaderProgramv_impl(st->ctx, GL_COMPUTE_SHADER, 1, strings);
++   free(source_str);
++
++   struct gl_shader_program *shProg =
++      _mesa_lookup_shader_program(st->ctx, program);
++   if (!shProg)
++      return NULL;
++
++   if (shProg->data->LinkStatus == LINKING_FAILURE) {
++      fprintf(stderr, "Linking failed:\n%s\n", shProg->data->InfoLog);
++      _mesa_reference_shader_program(st->ctx, &shProg, NULL);
++      return NULL;
++   }
++
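++   /* (Added note: the shader program object itself stays owned by the GL
++    * context; only the linked compute gl_program pointer is cached below,
++    * and the context frees it at destruction time -- see the comment in
++    * st_destroy_texcompress_compute().)
++    */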
++   /* Cache the program and return it. */
++   return st->texcompress_compute.progs[prog_id] =
++          shProg->_LinkedShaders[MESA_SHADER_COMPUTE]->Program;
++}
++
++static struct pipe_resource *
++create_bc1_endpoint_ssbo(struct pipe_context *pipe)
++{
++   struct pipe_resource *buffer =
++      pipe_buffer_create(pipe->screen, PIPE_BIND_SHADER_BUFFER,
++                         PIPE_USAGE_IMMUTABLE, sizeof(float) *
++                         (sizeof(stb__OMatch5) + sizeof(stb__OMatch6)));
++
++   if (!buffer)
++      return NULL;
++
++   struct pipe_transfer *transfer;
++   float (*buffer_map)[2] = pipe_buffer_map(pipe, buffer,
++                                            PIPE_MAP_WRITE |
++                                            PIPE_MAP_DISCARD_WHOLE_RESOURCE,
++                                            &transfer);
++   if (!buffer_map) {
++      pipe_resource_reference(&buffer, NULL);
++      return NULL;
++   }
++
++   for (int i = 0; i < 256; i++) {
++      for (int j = 0; j < 2; j++) {
++         buffer_map[i][j] = (float) stb__OMatch5[i][j];
++         buffer_map[i + 256][j] = (float) stb__OMatch6[i][j];
++      }
++   }
++
++   pipe_buffer_unmap(pipe, transfer);
++
++   return buffer;
++}
++
++static void
++bind_compute_state(struct st_context *st,
++                   struct gl_program *prog,
++                   struct pipe_sampler_view **sampler_views,
++                   const struct pipe_shader_buffer *shader_buffers,
++                   const struct pipe_image_view *image_views,
++                   bool cs_handle_from_prog,
++                   bool constbuf0_from_prog)
++{
++   assert(prog->info.stage == PIPE_SHADER_COMPUTE);
++
++   /* Set compute states in the same order as defined in st_atom_list.h */
++
++   assert(prog->affected_states & ST_NEW_CS_STATE);
++   assert(st->shader_has_one_variant[PIPE_SHADER_COMPUTE]);
++   cso_set_compute_shader_handle(st->cso_context,
++                                 cs_handle_from_prog ?
++                                 prog->variants->driver_shader : NULL);
++
++   if (prog->affected_states & ST_NEW_CS_SAMPLER_VIEWS) {
++      st->pipe->set_sampler_views(st->pipe, prog->info.stage, 0,
++                                  prog->info.num_textures, 0, false,
++                                  sampler_views);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_SAMPLERS) {
++      /* Programs seem to set this bit more often than needed. For example, if
++       * a program only uses texelFetch, this shouldn't be needed. Section
++       * "11.1.3.2 Texel Fetches", of the GL 4.6 spec says:
++       *
++       *    Texel fetch proceeds similarly to the steps described for texture
++       *    access in section 11.1.3.5, with the exception that none of the
++       *    operations controlled by sampler object state are performed,
++       *
++       * We assume that the program is using texelFetch or doesn't care about
++       * this state for a similar reason.
++       *
++       * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/8014.
++       */
++   }
++
++   if (prog->affected_states & ST_NEW_CS_CONSTANTS) {
++      st_upload_constants(st, constbuf0_from_prog ? prog : NULL,
++                          prog->info.stage);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_UBOS) {
++      unreachable("Uniform buffer objects not handled");
++   }
++
++   if (prog->affected_states & ST_NEW_CS_ATOMICS) {
++      unreachable("Atomic buffer objects not handled");
++   }
++
++   if (prog->affected_states & ST_NEW_CS_SSBOS) {
++      st->pipe->set_shader_buffers(st->pipe, prog->info.stage, 0,
++                                   prog->info.num_ssbos, shader_buffers,
++                                   prog->sh.ShaderStorageBlocksWriteAccess);
++   }
++
++   if (prog->affected_states & ST_NEW_CS_IMAGES) {
++      st->pipe->set_shader_images(st->pipe, prog->info.stage, 0,
++                                  prog->info.num_images, 0, image_views);
++   }
++}
++
++static void
++dispatch_compute_state(struct st_context *st,
++                       struct gl_program *prog,
++                       struct pipe_sampler_view **sampler_views,
++                       const struct pipe_shader_buffer *shader_buffers,
++                       const struct pipe_image_view *image_views,
++                       unsigned num_workgroups_x,
++                       unsigned num_workgroups_y,
++                       unsigned num_workgroups_z)
++{
++   assert(prog->info.stage == PIPE_SHADER_COMPUTE);
++
++   /* Bind the state */
++   bind_compute_state(st, prog, sampler_views, shader_buffers, image_views,
++                      true, true);
++
++   /* Launch the grid */
++   const struct pipe_grid_info info = {
++      .block[0] = prog->info.workgroup_size[0],
++      .block[1] = prog->info.workgroup_size[1],
++      .block[2] = prog->info.workgroup_size[2],
++      .grid[0] = num_workgroups_x,
++      .grid[1] = num_workgroups_y,
++      .grid[2] = num_workgroups_z,
++   };
++
++   st->pipe->launch_grid(st->pipe, &info);
++
++   /* Unbind the state */
++   bind_compute_state(st, prog, NULL, NULL, NULL, false, false);
++
++   /* If the previously used compute program was relying on any state that was
++    * trampled on by these state changes, dirty the relevant flags.
++    */
++   if (st->cp) {
++      st->ctx->NewDriverState |=
++         st->cp->affected_states & prog->affected_states;
++   }
++}
++
++static struct pipe_resource *
++cs_encode_bc1(struct st_context *st,
++              struct pipe_resource *rgba8_tex)
++{
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_BC1, bc1_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   /* ... complete the program setup by defining the number of refinements to
++    * do on the created blocks. The program will attempt to create a more
++    * accurate encoding on each iteration. Doing at least one refinement
++    * provides a significant improvement in quality and is needed to give a
++    * result comparable to the CPU encoder (according to piglit tests).
++    * Additional refinements don't help as much.
++    */
++   const unsigned num_refinements = 1;
++   _mesa_uniform(0, 1, &num_refinements, st->ctx, prog->shader_program,
++                 GLSL_TYPE_UINT, 1);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R8G8B8A8_UNORM,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++   struct pipe_sampler_view *rgba8_view =
++      st->pipe->create_sampler_view(st->pipe, rgba8_tex, &templ);
++   if (!rgba8_view)
++      return NULL;
++
++   const struct pipe_shader_buffer ssbo = {
++      .buffer = st->texcompress_compute.bc1_endpoint_buf,
++      .buffer_size = st->texcompress_compute.bc1_endpoint_buf->width0,
++   };
++
++   struct pipe_resource *bc1_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32_UINT, 0,
++                        DIV_ROUND_UP(rgba8_tex->width0, 4),
++                        DIV_ROUND_UP(rgba8_tex->height0, 4), 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!bc1_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = bc1_tex,
++      .format = PIPE_FORMAT_R16G16B16A16_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, &rgba8_view, &ssbo, &image,
++                          DIV_ROUND_UP(rgba8_tex->width0, 32),
++                          DIV_ROUND_UP(rgba8_tex->height0, 32), 1);
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rgba8_view, NULL);
++
++   return bc1_tex;
++}
++
++static struct pipe_resource *
++cs_encode_bc4(struct st_context *st,
++              struct pipe_resource *rgba8_tex,
++              enum pipe_swizzle component, bool use_snorm)
++{
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_BC4, bc4_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   /* ... complete the program setup by picking the channel to encode and
++    * whether to encode it as snorm. The shader doesn't actually support
++    * channel index 2. So, pick index 0 and rely on swizzling instead.
++    */
++   const unsigned params[] = { 0, use_snorm };
++   _mesa_uniform(0, 1, params, st->ctx, prog->shader_program,
++                 GLSL_TYPE_UINT, 2);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R8G8B8A8_UNORM,
++      .swizzle_r = component,
++      .swizzle_g = PIPE_SWIZZLE_0,
++      .swizzle_b = PIPE_SWIZZLE_0,
++      .swizzle_a = PIPE_SWIZZLE_1,
++   };
++   struct pipe_sampler_view *rgba8_view =
++      st->pipe->create_sampler_view(st->pipe, rgba8_tex, &templ);
++   if (!rgba8_view)
++      return NULL;
++
++   struct pipe_resource *bc4_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32_UINT, 0,
++                        DIV_ROUND_UP(rgba8_tex->width0, 4),
++                        DIV_ROUND_UP(rgba8_tex->height0, 4), 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!bc4_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = bc4_tex,
++      .format = PIPE_FORMAT_R16G16B16A16_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, &rgba8_view, NULL, &image, 1,
++                          DIV_ROUND_UP(rgba8_tex->width0, 16),
++                          DIV_ROUND_UP(rgba8_tex->height0, 16));
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rgba8_view, NULL);
++
++   return bc4_tex;
++}
++
++static struct pipe_resource *
++cs_stitch_64bpb_textures(struct st_context *st,
++                         struct pipe_resource *tex_hi,
++                         struct pipe_resource *tex_lo)
++{
++   assert(util_format_get_blocksizebits(tex_hi->format) == 64);
++   assert(util_format_get_blocksizebits(tex_lo->format) == 64);
++   assert(tex_hi->width0 == tex_lo->width0);
++   assert(tex_hi->height0 == tex_lo->height0);
++
++   struct pipe_resource *stitched_tex = NULL;
++
++   /* Create the required compute state */
++   struct gl_program *prog =
++      get_compute_program(st, COMPUTE_PROGRAM_STITCH, etc2_rgba_stitch_source,
++                          cross_platform_settings_piece_all_header);
++   if (!prog)
++      return NULL;
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R32G32_UINT,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_0,
++      .swizzle_a = PIPE_SWIZZLE_1,
++   };
++   struct pipe_sampler_view *rg32_views[2] = {
++      [0] = st->pipe->create_sampler_view(st->pipe, tex_hi, &templ),
++      [1] = st->pipe->create_sampler_view(st->pipe, tex_lo, &templ),
++   };
++   if (!rg32_views[0] || !rg32_views[1])
++      goto release_sampler_views;
++
++   stitched_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R32G32B32A32_UINT, 0,
++                        tex_hi->width0,
++                        tex_hi->height0, 1, 1, 0,
++                        PIPE_BIND_SHADER_IMAGE |
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!stitched_tex)
++      goto release_sampler_views;
++
++   const struct pipe_image_view image = {
++      .resource = stitched_tex,
++      .format = PIPE_FORMAT_R32G32B32A32_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   /* Dispatch the compute state */
++   dispatch_compute_state(st, prog, rg32_views, NULL, &image,
++                          DIV_ROUND_UP(tex_hi->width0, 8),
++                          DIV_ROUND_UP(tex_hi->height0, 8), 1);
++
++release_sampler_views:
++   pipe_sampler_view_reference(&rg32_views[0], NULL);
++   pipe_sampler_view_reference(&rg32_views[1], NULL);
++
++   return stitched_tex;
++}
++
++static struct pipe_resource *
++cs_encode_bc3(struct st_context *st,
++              struct pipe_resource *rgba8_tex)
++{
++   struct pipe_resource *bc3_tex = NULL;
++
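++   /* (Added note: a BC3/DXT5 block is 128 bits -- a 64-bit BC4-style alpha
++    * block plus a 64-bit BC1-style color block -- so the encode runs as two
++    * separate compute passes whose outputs are stitched into one 128-bit
++    * block per texel below.)
++    */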
++   /* Encode RGB channels as BC1. */
++   struct pipe_resource *bc1_tex = cs_encode_bc1(st, rgba8_tex);
++   if (!bc1_tex)
++      return NULL;
++
++   /* Encode alpha channels as BC4. */
++   struct pipe_resource *bc4_tex =
++      cs_encode_bc4(st, rgba8_tex, PIPE_SWIZZLE_W, false);
++   if (!bc4_tex)
++      goto release_textures;
++
++   st->pipe->memory_barrier(st->pipe, PIPE_BARRIER_TEXTURE);
++
++   /* Combine BC1 and BC4 to create BC3. */
++   bc3_tex = cs_stitch_64bpb_textures(st, bc1_tex, bc4_tex);
++   if (!bc3_tex)
++      goto release_textures;
++
++release_textures:
++   pipe_resource_reference(&bc1_tex, NULL);
++   pipe_resource_reference(&bc4_tex, NULL);
++
++   return bc3_tex;
++}
++
++static struct pipe_resource *
++sw_decode_astc(struct st_context *st,
++               uint8_t *astc_data,
++               unsigned astc_stride,
++               mesa_format astc_format,
++               unsigned width_px, unsigned height_px)
++{
++   /* Create the destination */
++   struct pipe_resource *rgba8_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8G8B8A8_UNORM, 0,
++                        width_px, height_px, 1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!rgba8_tex)
++      return NULL;
++
++   /* Temporarily map the destination and decode into the returned pointer */
++   struct pipe_transfer *rgba8_xfer;
++   void *rgba8_map = pipe_texture_map(st->pipe, rgba8_tex, 0, 0,
++                                      PIPE_MAP_WRITE, 0, 0,
++                                      width_px, height_px, &rgba8_xfer);
++   if (!rgba8_map) {
++      pipe_resource_reference(&rgba8_tex, NULL);
++      return NULL;
++   }
++
++   _mesa_unpack_astc_2d_ldr(rgba8_map, rgba8_xfer->stride,
++                            astc_data, astc_stride,
++                            width_px, height_px, astc_format);
++
++   pipe_texture_unmap(st->pipe, rgba8_xfer);
++
++   return rgba8_tex;
++}
++
++static struct pipe_sampler_view *
++create_astc_cs_payload_view(struct st_context *st,
++                            uint8_t *data, unsigned stride,
++                            uint32_t width_el, uint32_t height_el)
++{
++   const struct pipe_resource src_templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = PIPE_FORMAT_R32G32B32A32_UINT,
++      .bind = PIPE_BIND_SAMPLER_VIEW,
++      .usage = PIPE_USAGE_STAGING,
++      .width0 = width_el,
++      .height0 = height_el,
++      .depth0 = 1,
++      .array_size = 1,
++   };
++
++   struct pipe_resource *payload_res =
++      st->screen->resource_create(st->screen, &src_templ);
++
++   if (!payload_res)
++      return NULL;
++
++   struct pipe_box box;
++   u_box_origin_2d(width_el, height_el, &box);
++
++   st->pipe->texture_subdata(st->pipe, payload_res, 0, 0,
++                             &box,
++                             data,
++                             stride,
++                             0 /* unused */);
++
++   const struct pipe_sampler_view view_templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = payload_res->format,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++
++   struct pipe_sampler_view *view =
++      st->pipe->create_sampler_view(st->pipe, payload_res, &view_templ);
++
++   pipe_resource_reference(&payload_res, NULL);
++
++   return view;
++}
++
++static struct pipe_sampler_view *
++get_astc_partition_table_view(struct st_context *st,
++                              unsigned block_w,
++                              unsigned block_h)
++{
++   unsigned lut_width;
++   unsigned lut_height;
++   struct pipe_box ptable_box;
++   void *ptable_data =
++      _mesa_get_astc_decoder_partition_table(block_w, block_h, &lut_width, &lut_height);
++   u_box_origin_2d(lut_width, lut_height, &ptable_box);
++
++   struct pipe_sampler_view *view =
++      util_hash_table_get(st->texcompress_compute.astc_partition_tables,
++                          ptable_data);
++
++   if (view)
++      return view;
++
++   struct pipe_resource *res =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8_UINT, 0,
++                        ptable_box.width, ptable_box.height,
++                        1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++   if (!res)
++      return NULL;
++
++   st->pipe->texture_subdata(st->pipe, res, 0, 0,
++                             &ptable_box,
++                             ptable_data,
++                             ptable_box.width,
++                             0 /* unused */);
++
++   const struct pipe_sampler_view templ = {
++      .target = PIPE_TEXTURE_2D,
++      .format = res->format,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++   };
++
++   view = st->pipe->create_sampler_view(st->pipe, res, &templ);
++
++   pipe_resource_reference(&res, NULL);
++
++   if (view) {
++      _mesa_hash_table_insert(st->texcompress_compute.astc_partition_tables,
++                              ptable_data, view);
++      ASSERTED const unsigned max_entries =
++         COMPUTE_PROGRAM_ASTC_12x12 - COMPUTE_PROGRAM_ASTC_4x4 + 1;
++      assert(_mesa_hash_table_num_entries(
++               st->texcompress_compute.astc_partition_tables) < max_entries);
++   }
++
++   return view;
++}
++
++static struct pipe_resource *
++cs_decode_astc(struct st_context *st,
++               uint8_t *astc_data,
++               unsigned astc_stride,
++               mesa_format astc_format,
++               unsigned width_px, unsigned height_px)
++{
++   const enum compute_program_id astc_id = COMPUTE_PROGRAM_ASTC_4x4 +
++      util_format_linear(astc_format) - PIPE_FORMAT_ASTC_4x4;
++
++   unsigned block_w, block_h;
++   _mesa_get_format_block_size(astc_format, &block_w, &block_h);
++
++   struct gl_program *prog =
++      get_compute_program(st, astc_id, astc_source, block_w, block_h);
++
++   if (!prog)
++      return NULL;
++
++   struct pipe_sampler_view *ptable_view =
++      get_astc_partition_table_view(st, block_w, block_h);
++
++   if (!ptable_view)
++      return NULL;
++
++   struct pipe_sampler_view *payload_view =
++      create_astc_cs_payload_view(st, astc_data, astc_stride,
++                                  DIV_ROUND_UP(width_px, block_w),
++                                  DIV_ROUND_UP(height_px, block_h));
++
++   if (!payload_view)
++      return NULL;
++
++   /* Create the destination */
++   struct pipe_resource *rgba8_tex =
++      st_texture_create(st, PIPE_TEXTURE_2D, PIPE_FORMAT_R8G8B8A8_UNORM, 0,
++                        width_px, height_px, 1, 1, 0,
++                        PIPE_BIND_SAMPLER_VIEW, false);
++
++   if (!rgba8_tex)
++      goto release_payload_view;
++
++   const struct pipe_image_view image = {
++      .resource = rgba8_tex,
++      .format = PIPE_FORMAT_R8G8B8A8_UINT,
++      .access = PIPE_IMAGE_ACCESS_WRITE,
++      .shader_access = PIPE_IMAGE_ACCESS_WRITE,
++   };
++
++   struct pipe_sampler_view *sampler_views[] = {
++      st->texcompress_compute.astc_luts[0],
++      st->texcompress_compute.astc_luts[1],
++      st->texcompress_compute.astc_luts[2],
++      st->texcompress_compute.astc_luts[3],
++      st->texcompress_compute.astc_luts[4],
++      ptable_view,
++      payload_view,
++   };
++
++   dispatch_compute_state(st, prog, sampler_views, NULL, &image,
++                          DIV_ROUND_UP(payload_view->texture->width0, 2),
++                          DIV_ROUND_UP(payload_view->texture->height0, 2),
++                          1);
++
++release_payload_view:
++   pipe_sampler_view_reference(&payload_view, NULL);
++
++   return rgba8_tex;
++}
++
++static struct pipe_sampler_view *
++get_sampler_view_for_lut(struct pipe_context *pipe,
++                         const astc_decoder_lut *lut)
++{
++   struct pipe_resource *res =
++      pipe_buffer_create_with_data(pipe,
++                                   PIPE_BIND_SAMPLER_VIEW,
++                                   PIPE_USAGE_DEFAULT,
++                                   lut->size_B,
++                                   lut->data);
++   if (!res)
++      return NULL;
++
++   const struct pipe_sampler_view templ = {
++      .format = lut->format,
++      .target = PIPE_BUFFER,
++      .swizzle_r = PIPE_SWIZZLE_X,
++      .swizzle_g = PIPE_SWIZZLE_Y,
++      .swizzle_b = PIPE_SWIZZLE_Z,
++      .swizzle_a = PIPE_SWIZZLE_W,
++      .u.buf.offset = 0,
++      .u.buf.size = lut->size_B,
++   };
++
++   struct pipe_sampler_view *view =
++      pipe->create_sampler_view(pipe, res, &templ);
++
++   pipe_resource_reference(&res, NULL);
++
++   return view;
++}
++
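++/* (Added note: the LUT contents come from util/texcompress_astc_luts_wrap.h,
++ * which wraps the Granite ASTC decoder tables; each table is uploaded once
++ * as a texel buffer above and then shared by the decode shaders for every
++ * supported block size.)
++ */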
++/* Initializes required resources for Granite ASTC GPU decode.
++ *
++ * There are 5 texture buffer objects and one additional texture required.
++ * We initialize 5 tbo's here and a single texture later during runtime.
++ */
++static bool
++initialize_astc_decoder(struct st_context *st)
++{
++   astc_decoder_lut_holder astc_lut_holder;
++   _mesa_init_astc_decoder_luts(&astc_lut_holder);
++
++   const astc_decoder_lut *luts[] = {
++      &astc_lut_holder.color_endpoint,
++      &astc_lut_holder.color_endpoint_unquant,
++      &astc_lut_holder.weights,
++      &astc_lut_holder.weights_unquant,
++      &astc_lut_holder.trits_quints,
++   };
++
++   for (unsigned i = 0; i < ARRAY_SIZE(luts); i++) {
++      st->texcompress_compute.astc_luts[i] =
++         get_sampler_view_for_lut(st->pipe, luts[i]);
++      if (!st->texcompress_compute.astc_luts[i])
++         return false;
++   }
++
++   st->texcompress_compute.astc_partition_tables =
++      _mesa_pointer_hash_table_create(NULL);
++
++   if (!st->texcompress_compute.astc_partition_tables)
++      return false;
++
++   return true;
++}
++
++bool
++st_init_texcompress_compute(struct st_context *st)
++{
++   st->texcompress_compute.progs =
++      calloc(COMPUTE_PROGRAM_COUNT, sizeof(struct gl_program *));
++   if (!st->texcompress_compute.progs)
++      return false;
++
++   st->texcompress_compute.bc1_endpoint_buf =
++      create_bc1_endpoint_ssbo(st->pipe);
++   if (!st->texcompress_compute.bc1_endpoint_buf)
++      return false;
++
++   if (!initialize_astc_decoder(st))
++      return false;
++
++   return true;
++}
++
++static void
++destroy_astc_decoder(struct st_context *st)
++{
++   for (unsigned i = 0; i < ARRAY_SIZE(st->texcompress_compute.astc_luts); i++)
++      pipe_sampler_view_reference(&st->texcompress_compute.astc_luts[i], NULL);
++
++   if (st->texcompress_compute.astc_partition_tables) {
++      hash_table_foreach(st->texcompress_compute.astc_partition_tables,
++                         entry) {
++         pipe_sampler_view_reference(
++            (struct pipe_sampler_view **)&entry->data, NULL);
++      }
++   }
++
++   _mesa_hash_table_destroy(st->texcompress_compute.astc_partition_tables,
++                            NULL);
++}
++
++void
++st_destroy_texcompress_compute(struct st_context *st)
++{
++   /* The programs in the array are part of the gl_context (in st->ctx). They
++    * are automatically destroyed when the context is destroyed (via
++    * _mesa_free_context_data -> ... -> free_shader_program_data_cb).
++    */
++   free(st->texcompress_compute.progs);
++
++   /* Destroy the SSBO used by the BC1 shader program. */
++   pipe_resource_reference(&st->texcompress_compute.bc1_endpoint_buf, NULL);
++
++   destroy_astc_decoder(st);
++}
++
++/* See st_texcompress_compute.h for more information. */
++bool
++st_compute_transcode_astc_to_dxt5(struct st_context *st,
++                                  uint8_t *astc_data,
++                                  unsigned astc_stride,
++                                  mesa_format astc_format,
++                                  struct pipe_resource *dxt5_tex,
++                                  unsigned dxt5_level,
++                                  unsigned dxt5_layer)
++{
++   assert(_mesa_has_compute_shaders(st->ctx));
++   assert(_mesa_is_format_astc_2d(astc_format));
++   assert(dxt5_tex->format == PIPE_FORMAT_DXT5_RGBA ||
++          dxt5_tex->format == PIPE_FORMAT_DXT5_SRGBA);
++   assert(dxt5_level <= dxt5_tex->last_level);
++   assert(dxt5_layer <= util_max_layer(dxt5_tex, dxt5_level));
++
++   bool success = false;
++
++   /* Decode ASTC to RGBA8. */
++   struct pipe_resource *rgba8_tex =
++      cs_decode_astc(st, astc_data, astc_stride, astc_format,
++                     u_minify(dxt5_tex->width0, dxt5_level),
++                     u_minify(dxt5_tex->height0, dxt5_level));
++   if (!rgba8_tex)
++      return false;
++
++   st->pipe->memory_barrier(st->pipe, PIPE_BARRIER_TEXTURE);
++
++   /* Encode RGBA8 to BC3. */
++   struct pipe_resource *bc3_tex = cs_encode_bc3(st, rgba8_tex);
++   if (!bc3_tex)
++      goto release_textures;
++
++   /* Upload the result. */
++   struct pipe_box src_box;
++   u_box_origin_2d(bc3_tex->width0, bc3_tex->height0, &src_box);
++   st->pipe->resource_copy_region(st->pipe, dxt5_tex, dxt5_level,
++                                  0, 0, dxt5_layer, bc3_tex, 0, &src_box);
++
++   success = true;
++
++release_textures:
++   pipe_resource_reference(&rgba8_tex, NULL);
++   pipe_resource_reference(&bc3_tex, NULL);
++
++   return success;
++}
+diff --git a/src/mesa/state_tracker/st_texcompress_compute.h b/src/mesa/state_tracker/st_texcompress_compute.h
+new file mode 100644
+index 00000000000..fb2f08fdae7
+--- /dev/null
++++ b/src/mesa/state_tracker/st_texcompress_compute.h
+@@ -0,0 +1,51 @@
++/**************************************************************************
++ *
++ * Copyright © 2022 Intel Corporation
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
++ * DEALINGS IN THE SOFTWARE.
++ *
++ **************************************************************************/
++
++#ifndef ST_TEXCOMPRESS_COMPUTE_H
++#define ST_TEXCOMPRESS_COMPUTE_H
++
++bool
++st_init_texcompress_compute(struct st_context *st);
++
++void
++st_destroy_texcompress_compute(struct st_context *st);
++
++/**
++ * When this function returns true, the destination image will contain the
++ * contents of astc_data but transcoded to DXT5/BC3.
++ *
++ * Note that this function will internally create compute programs by using
++ * glCreateShaderProgramv with the application's GL context.
++ */
++bool
++st_compute_transcode_astc_to_dxt5(struct st_context *st,
++                                  uint8_t *astc_data,
++                                  unsigned astc_stride,
++                                  mesa_format astc_format,
++                                  struct pipe_resource *dxt5_tex,
++                                  unsigned dxt5_level,
++                                  unsigned dxt5_layer);
++
++#endif /* ST_TEXCOMPRESS_COMPUTE_H */
+diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
+index 0e72491e1b7..3e03d2fc3f5 100644
+--- a/src/mesa/state_tracker/st_texture.c
++++ b/src/mesa/state_tracker/st_texture.c
+@@ -241,6 +241,41 @@ st_texture_match_image(struct st_context *st,
+    return GL_TRUE;
+ }
+ 
++void
++st_texture_image_insert_transfer(struct gl_texture_image *stImage,
++                                 unsigned index,
++                                 struct pipe_transfer *transfer)
++{
++   /* Enlarge the transfer array if it's not large enough. */
++   if (index >= stImage->num_transfers) {
++      unsigned new_size = index + 1;
++
++      stImage->transfer = realloc(stImage->transfer,
++                                  new_size * sizeof(struct st_texture_image_transfer));
++      memset(&stImage->transfer[stImage->num_transfers], 0,
++             (new_size - stImage->num_transfers) *
++             sizeof(struct st_texture_image_transfer));
++      stImage->num_transfers = new_size;
++   }
++
++   assert(!stImage->transfer[index].transfer);
++   stImage->transfer[index].transfer = transfer;
++}
++
++/* See st_texture.h for more information. */
++GLuint
++st_texture_image_resource_level(struct gl_texture_image *stImage)
++{
++   /* An image for a non-finalized texture object only has a single level. */
++   if (stImage->pt != stImage->TexObject->pt)
++      return 0;
++
++   /* An immutable texture object may have views with an LOD offset. */
++   if (stImage->TexObject->Immutable)
++      return stImage->Level + stImage->TexObject->Attrib.MinLevel;
++
++   return stImage->Level;
++}
+ 
+ /**
+  * Map a texture image and return the address for a particular 2D face/slice/
+@@ -282,22 +317,10 @@ st_texture_image_map(struct st_context *st, struct gl_texture_image *stImage,
+ 
+    map = pipe_texture_map_3d(st->pipe, stImage->pt, level, usage,
+                              x, y, z, w, h, d, transfer);
+-   if (map) {
+-      /* Enlarge the transfer array if it's not large enough. */
+-      if (z >= stImage->num_transfers) {
+-         unsigned new_size = z + 1;
+-
+-         stImage->transfer = realloc(stImage->transfer,
+-                                     new_size * sizeof(struct st_texture_image_transfer));
+-         memset(&stImage->transfer[stImage->num_transfers], 0,
+-                (new_size - stImage->num_transfers) *
+-                sizeof(struct st_texture_image_transfer));
+-         stImage->num_transfers = new_size;
+-      }
+ 
+-      assert(!stImage->transfer[z].transfer);
+-      stImage->transfer[z].transfer = *transfer;
+-   }
++   if (map)
++      st_texture_image_insert_transfer(stImage, z, *transfer);
++
+    return map;
+ }
+ 
+diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
+index 29e9c98a354..ea989707e49 100644
+--- a/src/mesa/state_tracker/st_texture.h
++++ b/src/mesa/state_tracker/st_texture.h
+@@ -46,7 +46,7 @@ struct st_texture_image_transfer
+    /* For compressed texture fallback. */
+    GLubyte *temp_data; /**< Temporary compressed texture storage. */
+    unsigned temp_stride; /**< Stride of the compressed texture storage. */
+-   GLubyte *map; /**< Saved map pointer of the uncompressed transfer. */
++   struct pipe_box box; /**< Region of the transfer's resource to write. */
+ };
+ 
+ 
+@@ -178,6 +178,21 @@ st_texture_match_image(struct st_context *st,
+                        const struct pipe_resource *pt,
+                        const struct gl_texture_image *image);
+ 
++/* Insert a transfer pointer into the image's transfer array at the specified
++ * index. The array is reallocated if necessary.
++ */
++void
++st_texture_image_insert_transfer(struct gl_texture_image *stImage,
++                                 unsigned index,
++                                 struct pipe_transfer *transfer);
++
++/**
++ * Returns the level of the gl_texture_image with respect to the resource it
++ * is allocated within. Example: returns 0 for non-finalized texture.
++ */
++GLuint
++st_texture_image_resource_level(struct gl_texture_image *stImage);
++
+ /* Return a pointer to an image within a texture. Return image stride as
+  * well.
+  */
--
Gitee
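
Note: even with these patches applied, the compute transcode path stays
disabled until the Android system property checked in st_create_context_priv()
is set. A quick way to exercise it on a device (this assumes a root shell and
that the application creates a fresh GL context afterwards):

    adb shell setprop sys.vmi.gl.texturecompress 1

The property name comes from the st_context.c hunk above; any value other than
"1" (including the default "0") keeps the existing CPU decompression fallback.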