关于摩尔线程s80显卡OpenCL驱动崩溃的问题

2501_92453550 2025-06-14 15:04:34
本人长期开发、维护OpenCL开源可导光线追踪渲染器项目LibreDR https://codeberg.org/ybh1998/LibreDR 。最近在尝试适配摩尔线程s80显卡，在迁移摩尔线程s80时遇到编译kernel时驱动崩溃问题，详情如下：

本人在Ubuntu 20.04下分别尝试了musa_2.5.0和musa_2.7.1-rc3版本驱动，将kernel源代码使用clCreateProgramWithSource传入OpenCL驱动，kernel代码在附件“debug_kernel.cl”中。调用clBuildProgram编译kernel时程序发生segmentation fault，backtrace如附件“backtrace.png”中所示，显示segmentation fault发生在OpenCL驱动libufwriter_MUSA.so中，但由于缺失debug symbol，不能确定具体位置。

本人发现在clinfo中，摩尔线程显卡汇报支持cl_khr_spir扩展。本人尝试严格按照 https://github.com/KhronosGroup/SPIR 描述的标准编译SPIR程序，使用clCreateProgramWithBinary传入OpenCL驱动，在clBuildProgram时返回CL_DEVICE_NOT_AVAILABLE错误信息。

1. 使用源代码传入kernel时，由于segmentation fault发生在驱动代码内，是否方便处理相关驱动代码的问题？如果方便提供libufwriter_MUSA.so相关代码的debug symbol，本人可以协助调试；
2. 另外想问摩尔线程s80显卡OpenCL是否支持SPIR输入程序？如果支持，能否提供相应的使用样例？
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

typedef struct {
  float3 r, rd, rn, render, render_weight;
  float3 render_last, render_last_weight;
  float2 rt;
  float pdf;
  ulong2 pcg32_state;
  int ri, hit_cnt;
} Ray;

typedef struct __attribute__((__packed__)) {
  const float vertex[8][3][170u];
  const float uv_xyz[4][256u][256u];
  const int bvh_left[86u], bvh_right[86u], bvh_next[86u];
  const float bvh_corner[3][2][86u];
} GeometryBuffer;

typedef struct __attribute__((__packed__)) {
  float ray[27u][262144u];
  const float texture[14][256u][256u];
  const uint texture_cdf[256u][256u];
    float d_texture[14][256u][256u][8u];
  const float envmap[3][6][512u][512u];
  const uint envmap_cdf[6][512u][512u];
  float d_envmap[3][6][512u][512u][8u];
} RenderBuffer;

typedef struct __attribute__((__packed__)) {
  float ray_r[3][2u][262144u][128u];
  uint ray_ri[2u][262144u][128u];
  float ray_pdf[2u][262144u][128u];
  float ray_render_last[3][262144u][128u];
} IntermediateBuffer;

uint pcg32_next(Ray *ray) {
  ulong old_state = ray->pcg32_state.x;
  ray->pcg32_state.x = old_state * 0x5851f42d4c957f2dul + ray->pcg32_state.y;
  uint xor_shifted = ((old_state >> 18u) ^ old_state) >> 27u;
  uint rot = old_state >> 59u;
  return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 31));
}

float pcg32_next_float(Ray *ray) {
  union {
    uint u;
    float f;
  } x;
  x.u = (pcg32_next(ray) >> 9) | 0x3f800000u;
  return x.f - 1.f;
}

void pcg32_init(Ray *ray, ulong seed, ulong inc) {
  ray->pcg32_state.x = 0ul;
  ray->pcg32_state.y = (inc << 1u) | 1u;
  pcg32_next(ray);
  ray->pcg32_state.x += (0x853c49e6748fea9bul + seed);
  pcg32_next(ray);
}

float halton_float(const uint low_discrepancy, const uint base) {
  const uint PRIMES[] = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89};
  const uint prime_base = PRIMES[base];
  float f = 1, r = 0;
  for (uint index = get_global_id(1) + low_discrepancy + 1; index > 0; index /= prime_base) {
    f /= prime_base;
    r += f * (index % prime_base);
  }
  return r;
}

uint halton(const uint low_discrepancy, const uint base) {
  return halton_float(low_discrepancy, base) * (float)UINT_MAX;
}

void atomic_add_f(volatile global float *source, const float operand) {
  if (operand == 0.f)
    return;
  union {
    uint u32;
    float f32;
  } next, expected, current;
  current.f32 = *source;
  do {
    expected.f32 = current.f32;
    next.f32 = expected.f32 + operand;
    current.u32 = atomic_cmpxchg((volatile __global uint *)source, expected.u32, next.u32);
  } while (current.u32 != expected.u32);
}

void local_gather_atomic_add_f (volatile global float *source, local float *operand_gather, const float operand) {
  operand_gather[get_local_id(1)] = operand;
  barrier(CLK_LOCAL_MEM_FENCE);
  for (int i = get_local_size(1) / 2; i > 0; i >>= 1) {
    if (get_local_id(1) < i)
      operand_gather[get_local_id(1)] += operand_gather[get_local_id(1) + i];
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  if (get_local_id(1) == 0)
    atomic_add_f(source, operand_gather[0]);
  barrier(CLK_LOCAL_MEM_FENCE);
}

void d_material_normal(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    const uint2 texture_index,
    const int i_bounce,
    const float3 d_normal) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[0][texture_index.x][texture_index.y][i_bucket], d_normal.x);
  atomic_add_f(&render_buffer->d_texture[1][texture_index.x][texture_index.y][i_bucket], d_normal.y);
  atomic_add_f(&render_buffer->d_texture[2][texture_index.x][texture_index.y][i_bucket], d_normal.z);
}

float3 material_diffuse(
    const volatile global RenderBuffer *render_buffer,
    const uint2 texture_index,
    const int i_bounce) {
  return (float3)(render_buffer->texture[3][texture_index.x][texture_index.y],
                  render_buffer->texture[4][texture_index.x][texture_index.y],
                  render_buffer->texture[5][texture_index.x][texture_index.y]);
}

void d_material_diffuse(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    const uint2 texture_index,
    const int i_bounce,
    const float3 d_diffuse) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[3][texture_index.x][texture_index.y][i_bucket], d_diffuse.x);
  atomic_add_f(&render_buffer->d_texture[4][texture_index.x][texture_index.y][i_bucket], d_diffuse.y);
  atomic_add_f(&render_buffer->d_texture[5][texture_index.x][texture_index.y][i_bucket], d_diffuse.z);
}

float3 material_specular(
    const volatile global RenderBuffer *render_buffer,
    const uint2 texture_index,
    const int i_bounce) {
  return (float3)(render_buffer->texture[6][texture_index.x][texture_index.y],
                  render_buffer->texture[7][texture_index.x][texture_index.y],
                  render_buffer->texture[8][texture_index.x][texture_index.y]);
}

void d_material_specular(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    const uint2 texture_index,
    const int i_bounce,
    const float3 d_specular) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[6][texture_index.x][texture_index.y][i_bucket], d_specular.x);
  atomic_add_f(&render_buffer->d_texture[7][texture_index.x][texture_index.y][i_bucket], d_specular.y);
  atomic_add_f(&render_buffer->d_texture[8][texture_index.x][texture_index.y][i_bucket], d_specular.z);
}

float material_roughness(
    const volatile global RenderBuffer *render_buffer,
    const uint2 texture_index,
    const int i_bounce) {
  return render_buffer->texture[9][texture_index.x][texture_index.y];
}

void d_material_roughness(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    const uint2 texture_index,
    const int i_bounce,
    const float d_roughness) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[9][texture_index.x][texture_index.y][i_bucket], d_roughness);
}

float phong_roughness_to_shininess(const float roughness) {
  return 2.f / roughness - 2.f;
}

float d_phong_roughness_to_shininess(const float roughness, const float d_shininess) {
  return -2.f * d_shininess / pown(roughness, 2);
}

float specular_D_beckmann(const float3 rn, const float3 h, const float roughness) {
  const float alpha = roughness / 4.f;
  const float n_dot_h = dot(rn, h);
  return exp((pown(n_dot_h, 2) - 1.f) / (alpha * pown(n_dot_h, 2))) /
    (4 * M_PI_F * alpha * pown(n_dot_h, 4));
}

void d_specular_D_beckmann(
    const float3 rn,
    const float3 h,
    const float roughness,
    const float d_D,
    float3 *d_normal,
    float *d_roughness) {
  const float alpha = roughness / 4.f;
  const float n_dot_h = dot(rn, h);
  const float d_n_dot_h = d_D * exp((pown(n_dot_h, 2) - 1.f) / (alpha * pown(n_dot_h, 2))) *
    (1.f - 2.f * alpha * pown(n_dot_h, 2)) / (2.f * M_PI_F * pown(alpha, 2) * pown(n_dot_h, 7));
  *d_normal += d_n_dot_h * h;
  const float d_alpha = d_D * exp((pown(n_dot_h, 2) - 1.f) / (alpha * pown(n_dot_h, 2))) *
    (1.f - (alpha + 1.f) * pown(n_dot_h, 2)) / (4.f * M_PI_F * pown(n_dot_h, 6) * pow(alpha, 3));
  *d_roughness += d_alpha / 4.f;
}

int bin_search(const global uint *cdf, const uint size, const uint value) {
  int a = -1, b = size - 1;
  while (a + 1 < b) {
    int m = (a + b) / 2;
    if (cdf[m] < value)
      a = m;
    else
      b = m;
  }
  return b;
}

float3 envmap_offset_to_rd(const uint envmap_offset) {
  const uint envmap_face_id = envmap_offset / (512u * 512u);
  const uint envmap_row = (envmap_offset / 512u) % 512u;
  const uint envmap_col = envmap_offset % 512u;
  const float2 background_uv = (float2)(
    (envmap_col + 0.5f) / 512u,
    (512u - 0.5f - envmap_row) / 512u) * 2.f - 1.f;
  float3 next_rd;
  switch (envmap_face_id / 2) {
    case 0:
      next_rd.x = (envmap_face_id % 2) ? -1.f : 1.f;
      next_rd.y = background_uv.x;
      next_rd.z = background_uv.y;
      break;
    case 1:
      next_rd.x = background_uv.x;
      next_rd.y = (envmap_face_id % 2) ? -1.f : 1.f;
      next_rd.z = background_uv.y;
      break;
    case 2:
      next_rd.x = background_uv.x;
      next_rd.y = background_uv.y;
      next_rd.z = (envmap_face_id % 2) ? -1.f : 1.f;
      break;
  }
  next_rd /= sqrt(dot(next_rd, next_rd));
  return next_rd;
}

float3 sample_reflection(
    const global GeometryBuffer *geometry_buffer,
    const volatile global RenderBuffer *render_buffer,
    Ray *ray,
    const uint2 texture_index,
    int i_bounce,
    uint low_discrepancy) {
  const float3 r = ray->rd - 2 * dot(ray->rd, ray->rn) * ray->rn;
  const uint w = i_bounce < 0 ? halton(low_discrepancy, i_bounce * 3 + 2) : pcg32_next(ray);
  float tmp;
  float3 next_rd = (float3)(0.f, 0.f, 0.f);
  if (w < 1073741826u + 1073741823u) {
    float3 n_coord[3];
    n_coord[2] = ray->rn;
    n_coord[0] = cross((float3)(0.f, 0.f, 1.f), n_coord[2]);
    tmp = dot(n_coord[0], n_coord[0]);
    if (tmp < 1e-3f) {
      n_coord[0] = cross((float3)(0.f, 1.f, 0.f), n_coord[2]);
      tmp = dot(n_coord[0], n_coord[0]);
    }
    if (tmp < 1e-3f) {
      n_coord[0] = cross((float3)(1.f, 0.f, 0.f), n_coord[2]);
      tmp = dot(n_coord[0], n_coord[0]);
    }
    n_coord[0] /= sqrt(tmp);
    n_coord[1] = cross(n_coord[0], n_coord[2]);
    n_coord[1] /= sqrt(dot(n_coord[1], n_coord[1]));
    const float sample_a =
      i_bounce < 0 ? halton_float(low_discrepancy, i_bounce * 3 + 3) : pcg32_next_float(ray);
    const float sample_b =
      i_bounce < 0 ? halton_float(low_discrepancy, i_bounce * 3 + 4) : pcg32_next_float(ray);
    const float2 sample = (float2)(sample_a, sample_b);
    const float sin_phi = sin(2.f * M_PI_F * sample.x);
    const float cos_phi = cos(2.f * M_PI_F * sample.x);
    if (w <= 1073741826u) {
      tmp = sqrt(1.f - sample.y);
      const float3 local_dir = (float3)(cos_phi * tmp, sin_phi * tmp, sqrt(sample.y));
      next_rd = local_dir.x * n_coord[0] + local_dir.y * n_coord[1] + local_dir.z * n_coord[2];
    } else {
      float3 r_coord[3];
      r_coord[2] = r;
      r_coord[0] = cross((float3)(0.f, 0.f, 1.f), r_coord[2]);
      tmp = dot(r_coord[0], r_coord[0]);
      if (tmp < 1e-3f) {
        r_coord[0] = cross((float3)(0.f, 1.f, 0.f), r_coord[2]);
        tmp = dot(r_coord[0], r_coord[0]);
      }
      if (tmp < 1e-3f) {
        r_coord[0] = cross((float3)(1.f, 0.f, 0.f), r_coord[2]);
        tmp = dot(r_coord[0], r_coord[0]);
      }
      r_coord[0] /= sqrt(tmp);
      r_coord[1] = cross(r_coord[0], r_coord[2]);
      r_coord[1] /= sqrt(dot(r_coord[1], r_coord[1]));
      const float roughness = material_roughness(render_buffer, texture_index, i_bounce);
      const float shininess = phong_roughness_to_shininess(roughness);
      const float cos_theta = pow(sample.y, 1.f / (shininess + 1.f));
      const float sin_theta = sqrt(1.f - cos_theta * cos_theta);
      const float3 local_dir = (float3)(cos_phi * sin_theta, sin_phi * sin_theta, cos_theta);
      next_rd = local_dir.x * r_coord[0] + local_dir.y * r_coord[1] + local_dir.z * r_coord[2];
      if (dot(next_rd, ray->rn) < 0) {
        next_rd -= 2 * dot(next_rd, ray->rn) * ray->rn;
      }
    }
  } else if (w < 1073741826u + 1073741823u + 1073741823u) {
    const uint texture_offset = bin_search(
      (const global uint *)render_buffer->texture_cdf,
      256u * 256u,
      i_bounce < 0 ? halton(low_discrepancy, i_bounce * 3 + 3) : pcg32_next(ray));
    const uint2 texture_index = (uint2)(texture_offset / 256u, texture_offset % 256u);
    const float3 look_at = (float3)(
      geometry_buffer->uv_xyz[0][texture_index.x][texture_index.y],
      geometry_buffer->uv_xyz[1][texture_index.x][texture_index.y],
      geometry_buffer->uv_xyz[2][texture_index.x][texture_index.y]);
    next_rd = look_at - ray->r;
    const float dist = sqrt(dot(next_rd, next_rd));
    if (dist < 1e-3f) {
      ray->render_weight = (float3)(0.f, 0.f, 0.f);
      next_rd = (float3)(0.f, 1.f, 0.f);
    } else {
      next_rd /= dist;
    }
  } else {
    const uint envmap_offset = bin_search(
      (const global uint *)render_buffer->envmap_cdf,
      6 * 512u * 512u,
      i_bounce < 0 ? halton(low_discrepancy, i_bounce * 3 + 3) : pcg32_next(ray));
    next_rd = envmap_offset_to_rd(envmap_offset);
  }
  return next_rd;
}

float3 diffuse_brdf(
    const volatile global RenderBuffer *render_buffer,
    Ray *ray,
    const float3 next_rd,
    const uint2 texture_index,
    const int i_bounce) {
  float3 brdf = (float3)(0.f, 0.f, 0.f);
  if (dot(ray->rn, next_rd) < 1e-3f)
    return brdf;
  const float3 diffuse = material_diffuse(render_buffer, texture_index, i_bounce);
  brdf = diffuse * dot(ray->rn, next_rd) / M_PI_F;
  return brdf;
}

void d_diffuse_brdf(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    Ray *ray,
    float3 next_rd,
    const uint2 texture_index,
    const int i_bounce,
    float3 d_brdf) {
  float3 d_diffuse = (float3)(0.f, 0.f, 0.f);
  float3 d_normal = (float3)(0.f, 0.f, 0.f);
  if (fabs(d_brdf.x) < 1e-6f && fabs(d_brdf.y) < 1e-6f && fabs(d_brdf.z) < 1e-6f)
    goto d_return;
  if (dot(ray->rn, next_rd) < 1e-3f)
    goto d_return;
  d_diffuse += d_brdf * dot(ray->rn, next_rd) / M_PI_F;
  const float3 diffuse = material_diffuse(render_buffer, texture_index, i_bounce);
  d_normal += next_rd * dot(d_brdf, diffuse) / M_PI_F;
d_return:
  d_material_diffuse(render_buffer, operand_gather, texture_index, i_bounce, d_diffuse);
  d_material_normal(render_buffer, operand_gather, texture_index, i_bounce, d_normal);
}

float3 specular_brdf(
    const volatile global RenderBuffer *render_buffer,
    Ray *ray,
    const float3 next_rd,
    const uint2 texture_index,
    const int i_bounce) {
  float3 brdf = (float3)(0.f, 0.f, 0.f);
  if (dot(ray->rn, next_rd) < 1e-3f)
    return brdf;
  const float3 specular = material_specular(render_buffer, texture_index, i_bounce);
  const float roughness = material_roughness(render_buffer, texture_index, i_bounce);
  // TODO: Shall we move the data EPS check to rust code?
  if (roughness < 1e-6f)
    return brdf;
  float3 h = next_rd - ray->rd;
  if (dot(h, h) < 1e-3f)
    return brdf;
  h /= sqrt(dot(h, h));
  if (dot(h, -ray->rd) < 1e-3f || dot(h, ray->rn) < 1e-3f)
    return brdf;
  const float3 F = specular + (1.f - specular) * pown(1.f - dot(ray->rn, -ray->rd), 5);
  const float G1 = 2.f * dot(h, ray->rn) * dot(ray->rn, -ray->rd) / dot(h, -ray->rd);
  const float G2 = 2.f * dot(h, ray->rn) * dot(ray->rn, next_rd) / dot(h, -ray->rd);
  const float G = min(1.f, min(G1, G2));
  const float D = specular_D_beckmann(ray->rn, h, roughness);
  brdf = D * G * F / dot(ray->rn, -ray->rd);
  return brdf;
}

void d_specular_brdf(
    volatile global RenderBuffer *render_buffer,
    local float *operand_gather,
    Ray *ray,
    float3 next_rd,
    const uint2 texture_index,
    const int i_bounce,
    float3 d_brdf) {
  float3 d_specular = (float3)(0.f, 0.f, 0.f);
  float d_roughness = 0.f;
  float3 d_normal = (float3)(0.f, 0.f, 0.f);
  if (fabs(d_brdf.x) < 1e-6f && fabs(d_brdf.y) < 1e-6f && fabs(d_brdf.z) < 1e-6f)
    goto d_return;
  if (dot(ray->rn, next_rd) < 1e-3f)
    goto d_return;
  const float3 specular = material_specular(render_buffer, texture_index, i_bounce);
  float roughness = material_roughness(render_buffer, texture_index, i_bounce);
  // TODO: Shall we move the data EPS check to rust code?
  if (roughness < 1e-6f)
    goto d_return;
  float3 h = next_rd - ray->rd;
  if (dot(h, h) < 1e-3f)
    goto d_return;
  h /= sqrt(dot(h, h));
  if (dot(h, -ray->rd) < 1e-3f || dot(h, ray->rn) < 1e-3f)
    goto d_return;
  const float3 F = specular + (1.f - specular) * pown(1.f - dot(ray->rn, -ray->rd), 5);
  const float G1 = 2.f * dot(h, ray->rn) * dot(ray->rn, -ray->rd) / dot(h, -ray->rd);
  const float G2 = 2.f * dot(h, ray->rn) * dot(ray->rn, next_rd) / dot(h, -ray->rd);
  const float G = min(1.f, min(G1, G2));
  const float D = specular_D_beckmann(ray->rn, h, roughness);
  const float d_D = dot(d_brdf, F) * G / dot(ray->rn, -ray->rd);
  float3 d_n_dot_v = -dot(d_brdf, F) * D * G / pown(dot(ray->rn, -ray->rd), 2);
  const float3 d_F = d_brdf * D * G / dot(ray->rn, -ray->rd);
  d_specular = d_F * (1.f - pown(1.f - dot(ray->rn, -ray->rd), 5));
  d_n_dot_v += -5 * dot(d_F, 1.f - specular) * pown(1.f - dot(ray->rn, -ray->rd), 4);
  d_normal += d_n_dot_v * -ray->rd;
  const float d_G = dot(d_brdf, F) * D / dot(ray->rn, -ray->rd);
  if (G1 < 1.f && G1 <= G2) {
    d_normal += 2.f * d_G * (dot(h, ray->rn) * -ray->rd + h * dot(ray->rn, -ray->rd)) / dot(h, -ray->rd);
  }
  if (G2 < 1.f && G2 < G1) {
    d_normal += 2.f * d_G * (dot(h, ray->rn) * next_rd + h * dot(ray->rn, next_rd)) / dot(h, -ray->rd);
  }
  d_specular_D_beckmann(ray->rn, h, roughness, d_D, &d_normal, &d_roughness);
d_return:
  d_material_specular(render_buffer, operand_gather, texture_index, i_bounce, d_specular);
  d_material_roughness(render_buffer, operand_gather, texture_index, i_bounce, d_roughness);
  d_material_normal(render_buffer, operand_gather, texture_index, i_bounce, d_normal);
}

float diffuse_pdf(Ray *ray, const float3 next_rd) {
  const float pdf = max(dot(ray->rn, next_rd), 0.f) / M_PI_F;
  return 0.25000000052386895f * pdf;
}

float specular_pdf(
    const volatile global RenderBuffer *render_buffer,
    Ray *ray,
    const float3 next_rd,
    const uint2 texture_index,
    const int i_bounce) {
  float pdf = 0.f;
  if (dot(ray->rn, next_rd) < 1e-3f)
    return pdf;
  const float roughness = material_roughness(render_buffer, texture_index, i_bounce);
  const float shininess = phong_roughness_to_shininess(roughness);
  const float3 r = ray->rd - 2 * dot(ray->rd, ray->rn) * ray->rn;
  float r_dot_wi = max(dot(r, next_rd - 2 * dot(next_rd, ray->rn) * ray->rn), 0.f);
  pdf += pow(r_dot_wi, shininess) * (shininess + 1.f) / (2.f * M_PI_F);
  r_dot_wi = max(dot(r, next_rd), 0.f);
  pdf += pow(r_dot_wi, shininess) * (shininess + 1.f) / (2.f * M_PI_F);
  return 0.24999999982537702f * pdf;
}

float float3_access(const float3 data, const uint idx) {
  if (idx == 0)
    return data.x;
  if (idx == 1)
    return data.y;
  return data.z;
}

float envmap_pdf(const global uint *cdf, const Ray *ray, const uint3 envmap_index) {
  uint envmap_offset = envmap_index.x * 512u * 512u +
    envmap_index.y * 512u + envmap_index.z;
  uint pdf = cdf[envmap_offset];
  if (envmap_offset > 0)
    pdf -= cdf[envmap_offset - 1];
  float n_dot_v = max(fabs(ray->rd.x), max(fabs(ray->rd.y), fabs(ray->rd.z)));
  return 3.814697263848643e-6f * pdf / pown(n_dot_v, 3);
}

void background_intersect(const volatile global RenderBuffer *render_buffer, Ray *ray, uint3 *envmap_index) {
  uint face_id = 0;
  const float3 rd_abs = fabs(ray->rd);
  if (rd_abs.x > rd_abs.y) {
    if (rd_abs.x > rd_abs.z)
      face_id = 0;
    else
      face_id = 2;
  } else {
    if (rd_abs.y > rd_abs.z)
      face_id = 1;
    else
      face_id = 2;
  }
  const float3 rd = ray->rd / float3_access(rd_abs, face_id);
  float2 uv;
  switch (face_id) {
  case 0:
    uv = (float2)(rd.y, rd.z);
    break;
  case 1:
    uv = (float2)(rd.x, rd.z);
    break;
  case 2:
    uv = (float2)(rd.x, rd.y);
    break;
  }
  uv = (uv + 1.f) / 2.f;
  if (float3_access(ray->rd, face_id) < 0)
    face_id = face_id * 2 + 1;
  else
    face_id = face_id * 2;
  uv = clamp(uv, 1e-6f, 1.f - 1e-6f);
  envmap_index->x = face_id;
  envmap_index->y = 512u - 1 - floor(uv.y * 512u);
  envmap_index->z = floor(uv.x * 512u);
  ray->pdf += envmap_pdf((const global uint *)render_buffer->envmap_cdf, ray, *envmap_index);
}

float triangle_intersect(const global GeometryBuffer *geometry_buffer, Ray *ray, const int triangle_id) {
  const float3 tri_v0 = (float3)(
    geometry_buffer->vertex[0][0][triangle_id],
    geometry_buffer->vertex[1][0][triangle_id],
    geometry_buffer->vertex[2][0][triangle_id]);
  const float3 u = (float3)(
    geometry_buffer->vertex[0][1][triangle_id],
    geometry_buffer->vertex[1][1][triangle_id],
    geometry_buffer->vertex[2][1][triangle_id]) - tri_v0;
  const float3 v = (float3)(
    geometry_buffer->vertex[0][2][triangle_id],
    geometry_buffer->vertex[1][2][triangle_id],
    geometry_buffer->vertex[2][2][triangle_id]) - tri_v0;
  const float3 fn = cross(u, v);
  const float3 w0 = ray->r - tri_v0;
  const float a = -dot(fn, w0);
  const float b = dot(fn, ray->rd);
  if (fabs(b) < 1e-12f) return MAXFLOAT;
  const float r = a / b;
  if (r <= 0) return MAXFLOAT;
  const float3 I = ray->r + ray->rd * r;
  const float uu = dot(u, u);
  const float uv = dot(u, v);
  const float vv = dot(v, v);
  const float3 w = I - tri_v0;
  const float wu = dot(w, u);
  const float wv = dot(w, v);
  const float D = uu * vv - uv * uv;
  const float D_min = -1e-5f * D;
  const float D_max = D - D_min;
  const float s = vv * wu - uv * wv;
  if (s < D_min || s > D_max) return MAXFLOAT;
  const float t = uu * wv - uv * wu;
  if (t < D_min || (s + t) > D_max) return MAXFLOAT;
  return r;
}

bool aabb_intersect(const global GeometryBuffer *geometry_buffer, Ray *ray, const int bvh_id) {
  const float3 corner[2] = {
    (float3)(
      geometry_buffer->bvh_corner[0][0][bvh_id],
      geometry_buffer->bvh_corner[1][0][bvh_id],
      geometry_buffer->bvh_corner[2][0][bvh_id]),
    (float3)(
      geometry_buffer->bvh_corner[0][1][bvh_id],
      geometry_buffer->bvh_corner[1][1][bvh_id],
      geometry_buffer->bvh_corner[2][1][bvh_id])
  };
  const float3 dir_frac = 1.0f / ray->rd;
  const float3 tmp[2] = {
    (corner[0] - ray->r) * dir_frac,
    (corner[1] - ray->r) * dir_frac
  };
  const float t_min = max(max(min(tmp[0].x, tmp[1].x), min(tmp[0].y, tmp[1].y)), min(tmp[0].z, tmp[1].z));
  const float t_max = min(min(max(tmp[0].x, tmp[1].x), max(tmp[0].y, tmp[1].y)), max(tmp[0].z, tmp[1].z));
  return t_max >= t_min;
}

float update_ray(
    const global GeometryBuffer *geometry_buffer,
    Ray *ray,
    const float d,
    const int triangle_id,
    uint2 *texture_index) {
  const float3 next_r = ray->r + ray->rd * d;
  const float3 tri_v0 = (float3)(
    geometry_buffer->vertex[0][0][triangle_id],
    geometry_buffer->vertex[1][0][triangle_id],
    geometry_buffer->vertex[2][0][triangle_id]);
  const float3 tri_v1 = (float3)(
    geometry_buffer->vertex[0][1][triangle_id],
    geometry_buffer->vertex[1][1][triangle_id],
    geometry_buffer->vertex[2][1][triangle_id]);
  const float3 tri_v2 = (float3)(
    geometry_buffer->vertex[0][2][triangle_id],
    geometry_buffer->vertex[1][2][triangle_id],
    geometry_buffer->vertex[2][2][triangle_id]);
  const float2 tri_vt0 = (float2)(
    geometry_buffer->vertex[6][0][triangle_id],
    geometry_buffer->vertex[7][0][triangle_id]);
  const float2 tri_vt1 = (float2)(
    geometry_buffer->vertex[6][1][triangle_id],
    geometry_buffer->vertex[7][1][triangle_id]);
  const float2 tri_vt2 = (float2)(
    geometry_buffer->vertex[6][2][triangle_id],
    geometry_buffer->vertex[7][2][triangle_id]);
  const float3 u_cross_v = cross(tri_v1 - tri_v0, tri_v2 - tri_v0);
  const float area_v = sqrt(dot(u_cross_v, u_cross_v));
  const float area_vt = fabs((tri_vt1 - tri_vt0).x * (tri_vt2 - tri_vt0).y -
                             (tri_vt1 - tri_vt0).y * (tri_vt2 - tri_vt0).x);
  float w = dot(cross(tri_v1 - tri_v0, next_r - tri_v0), u_cross_v) / dot(u_cross_v, u_cross_v);
  float v = dot(cross(next_r - tri_v0, tri_v2 - tri_v0), u_cross_v) / dot(u_cross_v, u_cross_v);
  w = max(1e-6f, min(1.f - 1e-6f, w));
  v = max(1e-6f, min(1.f - 1e-6f - w, v));
  float u = 1.f - w - v;
  ray->rt = tri_vt0 * u + tri_vt1 * v + tri_vt2 * w;
  ray->rt = clamp(ray->rt, 1e-6f, 1.f - 1e-6f);
  texture_index->x = 256u - 1 - floor(ray->rt.y * 256u);
  texture_index->y = floor(ray->rt.x * 256u);
  ray->rn = u_cross_v / sqrt(dot(u_cross_v, u_cross_v));
  if (dot(ray->rd, ray->rn) > 0)
    ray->rn = -ray->rn;
  return area_vt / area_v;
}

float texture_pdf(
    const global uint *texture_cdf,
    const Ray *ray,
    const float d,
    const float texture_area_ratio,
    const uint2 texture_index) {
  const uint texture_offset = texture_index.x * 256u + texture_index.y;
  uint pdf = texture_cdf[texture_offset];
  if (texture_offset > 0)
    pdf -= texture_cdf[texture_offset - 1];
  float n_dot_v = fabs(dot(ray->rd, ray->rn));
  if (n_dot_v < 1e-3f)
    return MAXFLOAT;
  return 3.814697263848643e-6f * pdf * texture_area_ratio * pown(d, 2) / n_dot_v;
}

void any_hit(
    const global GeometryBuffer *geometry_buffer,
    const global uint *texture_cdf,
    Ray *ray,
    const float d,
    const int triangle_id,
    uint2 *texture_index) {
  const float texture_area_ratio = update_ray(geometry_buffer, ray, d, triangle_id, texture_index);
  ray->pdf += texture_pdf(texture_cdf, ray, d, texture_area_ratio, *texture_index);
}

void closest_hit(
    const global GeometryBuffer *geometry_buffer,
    Ray *ray,
    const float d,
    uint2 *texture_index) {
  update_ray(geometry_buffer, ray, d, ray->ri, texture_index);
  ray->r += ray->rd * d;
}

void miss(Ray *ray) {
  ray->rn = (float3)(0.0f, 0.0f, 0.0f);
  ray->rt = (float2)(0.0f, 0.0f);
  ray->ri = -1;
}

float scene_intersect(
    const global GeometryBuffer *geometry_buffer,
    const global uint *texture_cdf,
    Ray *ray,
    uint2 *texture_index) {
  if (ray->ri == -2)
    return 0.f;
  ray->hit_cnt = 0;
  float d = MAXFLOAT;
  int bvh_id = 0;
  while (bvh_id != -1) {
    int bvh_left = geometry_buffer->bvh_left[bvh_id];
    int bvh_right = geometry_buffer->bvh_right[bvh_id];
    int bvh_next = geometry_buffer->bvh_next[bvh_id];
    bool has_intersect = aabb_intersect(geometry_buffer, ray, bvh_id);
    bool bvh_is_leaf = (bvh_right != -1);
    if (!has_intersect) {
      bvh_id = bvh_next;
      continue;
    }
    if (!bvh_is_leaf) {
      bvh_id = bvh_left;
      continue;
    }
    for (int i = bvh_left; i < bvh_right; ++i) {
      float new_d = triangle_intersect(geometry_buffer, ray, i);
      if (new_d != MAXFLOAT) {
        if (new_d < d) {
          ray->ri = i;
          d = new_d;
        }
        ray->hit_cnt++;
        any_hit(geometry_buffer, texture_cdf, ray, new_d, i, texture_index);
      }
    }
    bvh_id = bvh_next;
  }
  if (ray->hit_cnt) {
    closest_hit(geometry_buffer, ray, d, texture_index);
    return d;
  }
  miss(ray);
  return 0.f;
}

Ray ray_init(const int srand, const uint low_discrepancy, const volatile global RenderBuffer *render_buffer) {
  Ray ray;
  const uint i_pixel = get_global_id(0);
  const uint i_sample = get_global_id(1);
  if (srand >= 0)
    pcg32_init(&ray, srand, i_sample);
  else
    pcg32_init(&ray, i_pixel + get_global_size(0) * (1 - srand), i_sample);
    float2 pixel_coord = (float2)(pcg32_next_float(&ray), pcg32_next_float(&ray));
  if (i_sample == 0) {
    pixel_coord = (float2)(0.5f, 0.5f);
  }
  ray.r = (float3)(render_buffer->ray[0][i_pixel],
                   render_buffer->ray[1][i_pixel],
                   render_buffer->ray[2][i_pixel]
  ) + pixel_coord.x * (float3)(render_buffer->ray[3][i_pixel],
                               render_buffer->ray[4][i_pixel],
                               render_buffer->ray[5][i_pixel]
  ) + pixel_coord.y * (float3)(render_buffer->ray[6][i_pixel],
                               render_buffer->ray[7][i_pixel],
                               render_buffer->ray[8][i_pixel]);
  ray.rd = (float3)(render_buffer->ray[9][i_pixel],
                    render_buffer->ray[10][i_pixel],
                    render_buffer->ray[11][i_pixel]
  ) + pixel_coord.x * (float3)(render_buffer->ray[12][i_pixel],
                               render_buffer->ray[13][i_pixel],
                               render_buffer->ray[14][i_pixel]
  ) + pixel_coord.y * (float3)(render_buffer->ray[15][i_pixel],
                               render_buffer->ray[16][i_pixel],
                               render_buffer->ray[17][i_pixel]);
  ray.rd /= sqrt(dot(ray.rd, ray.rd));
  ray.render = (float3)(0.f, 0.f, 0.f);
  ray.render_last = (float3)(0.f, 0.f, 0.f);
  ray.render_last_weight = (float3)(1.f, 1.f, 1.f);
  ray.ri = 0;
  return ray;
}

void material_normal(
    Ray *ray,
    const volatile global RenderBuffer *render_buffer,
    const uint2 texture_index,
    const int i_bounce) {
  if (!(false && i_bounce == 0) && ray->ri >= 0)
    ray->rn = (float3)(render_buffer->texture[0][texture_index.x][texture_index.y],
                       render_buffer->texture[1][texture_index.x][texture_index.y],
                       render_buffer->texture[2][texture_index.x][texture_index.y]);
  if (dot(ray->rn, ray->rn) < 1e-3f) {
    ray->ri = -2;
  } else {
    ray->rn /= sqrt(dot(ray->rn, ray->rn));
    if (dot(ray->rd, ray->rn) > 0)
      ray->rn = -ray->rn;
    if (dot(ray->rd, ray->rn) > -1e-3f)
      ray->ri = -2;
  }
}

float3 material_intensity(const volatile global RenderBuffer *render_buffer, const uint2 texture_index, const int i_bounce) {
  return (float3)(render_buffer->texture[10][texture_index.x][texture_index.y],
                  render_buffer->texture[11][texture_index.x][texture_index.y],
                  render_buffer->texture[12][texture_index.x][texture_index.y]);
}

float material_window(const volatile global RenderBuffer *render_buffer, const uint2 texture_index, const int i_bounce) {
  return render_buffer->texture[13][texture_index.x][texture_index.y];
}

float3 material_envmap(const volatile global RenderBuffer *render_buffer, const uint3 envmap_index) {
  return (float3)(render_buffer->envmap[0][envmap_index.x][envmap_index.y][envmap_index.z],
                  render_buffer->envmap[1][envmap_index.x][envmap_index.y][envmap_index.z],
                  render_buffer->envmap[2][envmap_index.x][envmap_index.y][envmap_index.z]);
}

void d_material_intensity(
      volatile global RenderBuffer *render_buffer,
      local float *operand_gather,
      const uint2 texture_index,
      const int i_bounce,
      const float3 d_intensity) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[10][texture_index.x][texture_index.y][i_bucket], d_intensity.x);
  atomic_add_f(&render_buffer->d_texture[11][texture_index.x][texture_index.y][i_bucket], d_intensity.y);
  atomic_add_f(&render_buffer->d_texture[12][texture_index.x][texture_index.y][i_bucket], d_intensity.z);
}

void d_material_window(
      volatile global RenderBuffer *render_buffer,
      local float *operand_gather,
      const uint2 texture_index,
      const int i_bounce,
      const float d_window) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_texture[13][texture_index.x][texture_index.y][i_bucket], d_window);
}

void d_material_envmap(volatile global RenderBuffer *render_buffer, const uint3 envmap_index, const float3 d_envmap) {
  const uint i_bucket = get_global_id(1) % 8u;
  atomic_add_f(&render_buffer->d_envmap[0][envmap_index.x][envmap_index.y][envmap_index.z][i_bucket], d_envmap.x);
  atomic_add_f(&render_buffer->d_envmap[1][envmap_index.x][envmap_index.y][envmap_index.z][i_bucket], d_envmap.y);
  atomic_add_f(&render_buffer->d_envmap[2][envmap_index.x][envmap_index.y][envmap_index.z][i_bucket], d_envmap.z);
}

kernel void kernel_ray_tracing_forward(
    const int srand,
    const uint low_discrepancy,
    const global GeometryBuffer *restrict geometry_buffer,
    volatile global RenderBuffer *restrict render_buffer,
    global IntermediateBuffer *restrict intermediate_buffer) {
  local float operand_gather[128u];
  const uint i_pixel = get_global_id(0);
  const uint i_sample = get_global_id(1);
  Ray ray = ray_init(srand, low_discrepancy, render_buffer);
  for (int i_bounce = 0; i_bounce <= 4u; ++i_bounce) {
    const float clip_near = i_bounce == 0 ? 0.01f : i_bounce == 1 ? 0.0001f : 0.0001f;
    float d;
    uint2 texture_index;
    uint3 envmap_index;
    background_intersect(render_buffer, &ray, &envmap_index);
    if (false && i_bounce == 0) {
      d = render_buffer->ray[23][i_pixel];
      if (d <= 0) {
        ray.ri = -1;
      } else {
        ray.r += d * ray.rd;
      }
    } else {
      ray.r += clip_near * ray.rd;
      d = scene_intersect(
        geometry_buffer,
        (const global uint *)render_buffer->texture_cdf,
        &ray,
        &texture_index);
    }
    if (i_bounce == 0) {
      ray.render_weight = (float3)(1.f, 1.f, 1.f);
      ray.pdf = 1.f;
    }
    if (ray.pdf < 1e-6f || 1.0f / ray.pdf < 1e-6f ||
        (ray.render_weight.x < 1e-6f && ray.render_weight.y < 1e-6f && ray.render_weight.z < 1e-6f)) {
      ray.ri = -2;
    } else {
      ray.render_weight /= ray.pdf;
      if (i_bounce > 1u)
        ray.render_last_weight /= ray.pdf;
    }
    const float3 envmap = material_envmap(render_buffer, envmap_index);
    if (ray.ri == -1) {
      ray.r += ray.rd;
      ray.rn = ray.rd;
      if (i_bounce >= 0u)
        ray.render += ray.render_weight * envmap;
    }
    material_normal(&ray, render_buffer, texture_index, i_bounce);
    if (ray.ri == -2) {
      ray.r += ray.rd;
      ray.pdf = 0.0f;
    }
    if (i_bounce <= 1u && i_sample < 128u) {
      intermediate_buffer->ray_r[0][i_bounce][i_pixel][i_sample] = ray.r.x;
      intermediate_buffer->ray_r[1][i_bounce][i_pixel][i_sample] = ray.r.y;
      intermediate_buffer->ray_r[2][i_bounce][i_pixel][i_sample] = ray.r.z;
      intermediate_buffer->ray_ri[i_bounce][i_pixel][i_sample] = ray.ri;
      intermediate_buffer->ray_pdf[i_bounce][i_pixel][i_sample] = ray.pdf;
    }
    if (ray.ri < 0)
      break;
    if (i_bounce == 0 && i_sample == 0) {
      render_buffer->ray[21][i_pixel] = ray.rt.x;
      render_buffer->ray[22][i_pixel] = ray.rt.y;
      render_buffer->ray[23][i_pixel] = clip_near + d;
      render_buffer->ray[24][i_pixel] = ray.rn.x;
      render_buffer->ray[25][i_pixel] = ray.rn.y;
      render_buffer->ray[26][i_pixel] = ray.rn.z;
    }
    if (i_bounce >= 0u) {
      const float3 intensity = material_intensity(render_buffer, texture_index, i_bounce);
      const float window = material_window(render_buffer, texture_index, i_bounce);
      ray.render += ray.render_weight * (intensity + window * envmap);
      if (i_bounce > 1u)
        ray.render_last += ray.render_last_weight * (intensity + window * envmap);
    }
    const float3 next_rd = sample_reflection(
      geometry_buffer,
      render_buffer,
      &ray,
      texture_index,
      i_bounce,
      low_discrepancy);
    const float3 brdf = diffuse_brdf(render_buffer, &ray, next_rd, texture_index, i_bounce) +
                        specular_brdf(render_buffer, &ray, next_rd, texture_index, i_bounce);
    ray.pdf = diffuse_pdf(&ray, next_rd) + specular_pdf(render_buffer, &ray, next_rd, texture_index, i_bounce);
    ray.rd = next_rd;
    ray.render_weight *= brdf;
    if (i_bounce >= 1u)
      ray.render_last_weight *= brdf;
  }
  if (i_sample < 128u) {
    intermediate_buffer->ray_render_last[0][i_pixel][i_sample] = ray.render_last.x;
    intermediate_buffer->ray_render_last[1][i_pixel][i_sample] = ray.render_last.y;
    intermediate_buffer->ray_render_last[2][i_pixel][i_sample] = ray.render_last.z;
  }
  local_gather_atomic_add_f(&render_buffer->ray[18][i_pixel], operand_gather, ray.render.x / 1024u);
  local_gather_atomic_add_f(&render_buffer->ray[19][i_pixel], operand_gather, ray.render.y / 1024u);
  local_gather_atomic_add_f(&render_buffer->ray[20][i_pixel], operand_gather, ray.render.z / 1024u);
}

kernel void kernel_ray_tracing_backward(
    const int srand,
    const uint low_discrepancy,
    const global GeometryBuffer *restrict geometry_buffer,
    volatile global RenderBuffer *restrict render_buffer,
    const global IntermediateBuffer *restrict intermediate_buffer) {
  local float operand_gather[128u];
  const uint i_pixel = get_global_id(0);
  const uint i_sample = get_global_id(1);
  float3 chain_render_weight[1u + 1];
  float3 chain_render[1u + 1];
  chain_render_weight[0] = (float3)(1.f, 1.f, 1.f);
  Ray ray = ray_init(srand, low_discrepancy, render_buffer);
  uint2 texture_index = (uint2)(0u, 0u);
  uint3 envmap_index = (uint3)(0u, 0u, 0u);
  for (int i_bounce = 0; i_bounce <= 1u; ++i_bounce) {
    if (ray.ri < 0) {
      chain_render_weight[i_bounce] = (float3)(0.f, 0.f, 0.f);
      chain_render[i_bounce] = (float3)(0.f, 0.f, 0.f);
      continue;
    }
    const float3 next_r = (float3)(intermediate_buffer->ray_r[0][i_bounce][i_pixel][i_sample],
                                   intermediate_buffer->ray_r[1][i_bounce][i_pixel][i_sample],
                                   intermediate_buffer->ray_r[2][i_bounce][i_pixel][i_sample]);
    float3 curr_rd = next_r - ray.r;
    curr_rd /= sqrt(dot(curr_rd, curr_rd));
    ray.ri = intermediate_buffer->ray_ri[i_bounce][i_pixel][i_sample];
    ray.pdf = intermediate_buffer->ray_pdf[i_bounce][i_pixel][i_sample];
    if (ray.ri == -2) {
      chain_render_weight[i_bounce] = (float3)(0.f, 0.f, 0.f);
      chain_render[i_bounce] = (float3)(0.f, 0.f, 0.f);
      continue;
    }
    if (i_bounce > 0) {
      const float3 brdf = diffuse_brdf(render_buffer, &ray, curr_rd, texture_index, i_bounce - 1) +
                          specular_brdf(render_buffer, &ray, curr_rd, texture_index, i_bounce - 1);
      chain_render_weight[i_bounce] = brdf / ray.pdf;
    }
    ray.rd = curr_rd;
    ray.r = next_r;
    background_intersect(render_buffer, &ray, &envmap_index);
    const float3 envmap = material_envmap(render_buffer, envmap_index);
    if (ray.ri == -1) {
    if (i_bounce >= 0u) {
      chain_render[i_bounce] = envmap;
    } else {
      chain_render[i_bounce] = (float3)(0.f, 0.f, 0.f);
    }
      continue;
    }
    if (ray.ri < 0) {
      chain_render_weight[i_bounce] = (float3)(0.f, 0.f, 0.f);
      chain_render[i_bounce] = (float3)(0.f, 0.f, 0.f);
      continue;
    }
    if (!(false && i_bounce == 0))
      update_ray(geometry_buffer, &ray, 0.f, ray.ri, &texture_index);
    material_normal(&ray, render_buffer, texture_index, i_bounce);
    if (i_bounce >= 0u) {
      const float3 intensity = material_intensity(render_buffer, texture_index, i_bounce);
      const float window = material_window(render_buffer, texture_index, i_bounce);
      chain_render[i_bounce] = intensity + window * envmap;
    } else {
      chain_render[i_bounce] = (float3)(0.f, 0.f, 0.f);
    }
  }
  chain_render[1u] += (float3)(intermediate_buffer->ray_render_last[0][i_pixel][i_sample],
                                                intermediate_buffer->ray_render_last[1][i_pixel][i_sample],
                                                intermediate_buffer->ray_render_last[2][i_pixel][i_sample]);
  for (int i_bounce = 1u - 1; i_bounce >= 0; --i_bounce)
    chain_render[i_bounce] += chain_render[i_bounce + 1] * chain_render_weight[i_bounce + 1];
  ray = ray_init(srand, low_discrepancy, render_buffer);
  float3 d_render = (float3)(render_buffer->ray[18][i_pixel],
                             render_buffer->ray[19][i_pixel],
                             render_buffer->ray[20][i_pixel]) / (float)128u;
  for (int i_bounce = 0; i_bounce <= 1u; ++i_bounce) {
    bool is_continue = false;
    if (ray.ri < 0) {
      d_render = (float3)(0.f, 0.f, 0.f);
      is_continue = true;
      goto d_continue_before_brdf;
    }
    const float3 next_r = (float3)(intermediate_buffer->ray_r[0][i_bounce][i_pixel][i_sample],
                                   intermediate_buffer->ray_r[1][i_bounce][i_pixel][i_sample],
                                   intermediate_buffer->ray_r[2][i_bounce][i_pixel][i_sample]);
    float3 curr_rd = next_r - ray.r;
    curr_rd /= sqrt(dot(curr_rd, curr_rd));
    ray.ri = intermediate_buffer->ray_ri[i_bounce][i_pixel][i_sample];
    ray.pdf = intermediate_buffer->ray_pdf[i_bounce][i_pixel][i_sample];
    float3 envmap = (float3)(0.f, 0.f, 0.f);
    if (ray.ri == -2) {
      d_render = (float3)(0.f, 0.f, 0.f);
      is_continue = true;
      goto d_continue_before_brdf;
    }
d_continue_before_brdf:
    if (i_bounce > 0) {
      const float3 d_brdf = is_continue ? (float3)(0.f, 0.f, 0.f) : d_render * chain_render[i_bounce] / ray.pdf;
      d_diffuse_brdf(render_buffer, operand_gather, &ray, curr_rd, texture_index, i_bounce - 1, d_brdf);
      d_specular_brdf(render_buffer, operand_gather, &ray, curr_rd, texture_index, i_bounce - 1, d_brdf);
      d_render *= chain_render_weight[i_bounce];
    }
    if (is_continue)
      goto d_continue_after_brdf;
    ray.rd = curr_rd;
    ray.r = next_r;
    background_intersect(render_buffer, &ray, &envmap_index);
    envmap = material_envmap(render_buffer, envmap_index);
    if (ray.ri == -1 && i_bounce >= 0u)
      d_material_envmap(render_buffer, envmap_index, d_render);
    if (ray.ri < 0) {
      d_render = (float3)(0.f, 0.f, 0.f);
      goto d_continue_after_brdf;
    }
    if (!(false && i_bounce == 0))
      update_ray(geometry_buffer, &ray, 0.f, ray.ri, &texture_index);
    material_normal(&ray, render_buffer, texture_index, i_bounce);
    const float window = material_window(render_buffer, texture_index, i_bounce);
    if (i_bounce >= 0u)
      d_material_envmap(render_buffer, envmap_index, window * d_render);
d_continue_after_brdf:
    if (i_bounce >= 0u) {
      d_material_intensity(render_buffer, operand_gather, texture_index, i_bounce, d_render);
      d_material_window(render_buffer, operand_gather, texture_index, i_bounce, dot(d_render, envmap));
    }
  }
}

void collect_texture(volatile global float (*data)[256u][256u][8u]) {
  volatile global float (*data_offset)[8u] = &(*data)[get_global_id(0)][get_global_id(1)];
  for (int i = 1; i < 8u; ++i)
    (*data_offset)[0] += (*data_offset)[i];
}

kernel void kernel_ray_tracing_backward_collect_texture(volatile global RenderBuffer *render_buffer) {
  for (int i = 0; i < 14; ++i)
    collect_texture(&render_buffer->d_texture[i]);
}

void collect_envmap(volatile global float (*data)[512u][512u][8u]) {
  volatile global float (*data_offset)[8u] = &(*data)[get_global_id(0)][get_global_id(1)];
  for (int i = 1; i < 8u; ++i)
    (*data_offset)[0] += (*data_offset)[i];
}

kernel void kernel_ray_tracing_backward_collect_envmap(volatile global RenderBuffer *render_buffer) {
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 6; ++j)
      collect_envmap(&render_buffer->d_envmap[i][j]);
}
...全文