3d gaussian splatting核心代码注释(CUDA部分)
rasterizer_impl.cu:// 查找最高有效位(most significant bit),输入变量n表示tile编号最大值x、y的乘积uint32_t getHigherMsb(uint32_t n){uint32_t msb = sizeof(n) * 4;//4*4=16uint32_t step = msb;while (step > 1){step /= 2;
·
rasterizer_impl.cu:
// 查找最高有效位(most significant bit),输入变量n表示tile编号最大值x、y的乘积
uint32_t getHigherMsb(uint32_t n)
{
uint32_t msb = sizeof(n) * 4; //4*4=16
uint32_t step = msb;
while (step > 1)
{
step /= 2; //缩小2倍
if (n >> msb) //右移16位,相当于除以2的16次方
msb += step;
else
msb -= step;
}
if (n >> msb) //如果n的最高位大于0,则msb+1
msb++;
return msb;
}
// Wrapper method to call auxiliary coarse frustum containment test.
// Mark all Gaussians that pass it.
__global__ void checkFrustum(int P,
const float* orig_points,
const float* viewmatrix,
const float* projmatrix,
bool* present)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
float3 p_view;
present[idx] = in_frustum(idx, orig_points, viewmatrix, projmatrix, false, p_view);
}
//计算2d高斯椭圆中心点points_xy在2d像素平面上占据的tile的tileID,并将tileID|depth组合成64位的key值,value值为高斯球的编号
__global__ void duplicateWithKeys(
int P,
const float2* points_xy,
const float* depths,
const uint32_t* offsets, //累计的tiles数量的数组
uint64_t* gaussian_keys_unsorted, //未排序的key(tileID|depth)
uint32_t* gaussian_values_unsorted, //未排序的valu(depth)
int* radii, //高斯球的半径
dim3 grid) //block编号的xy两个极大值
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Generate no key/value pair for invisible Gaussians
if (radii[idx] > 0)
{
// Find this Gaussian's offset in buffer for writing keys/values.
uint32_t off = (idx == 0) ? 0 : offsets[idx - 1]; //第idx个高斯球前面已经占据的tiles总数
uint2 rect_min, rect_max;
//计算像素点points_xy[idx]在半径为radii[idx]的圆所占据的网格编号的最小值和最大值
getRect(points_xy[idx], radii[idx], rect_min, rect_max, grid);
// For each tile that the bounding rect overlaps, emit a
// key/value pair. The key is | tile ID | depth |,
// and the value is the ID of the Gaussian. Sorting the values
// with this key yields Gaussian IDs in a list, such that they
// are first sorted by tile and then by depth.
for (int y = rect_min.y; y < rect_max.y; y++)
{
for (int x = rect_min.x; x < rect_max.x; x++)
{
uint64_t key = y * grid.x + x; //计算当前block所在的tileID
key <<= 32; //左移32位,把后面的32位空出来
key |= *((uint32_t*)&depths[idx]); //后32位存高斯的深度值
//把上步得到的key,存到第off个(也即是当前高斯球前面的所有高斯所占据的tiles数量)gaussian_keys_unsorted的uint64_t数组中
gaussian_keys_unsorted[off] = key;
//values设置为idx
gaussian_values_unsorted[off] = idx;
off++;
}
}
}
}
// Check keys to see if it is at the start/end of one tile's range in
// the full sorted list. If yes, write start/end of this tile.
// Run once per instanced (duplicated) Gaussian ID.
//一个thread处理一个point_list_keys中的tile,总共L个tile;point_list_keys:已经排序过的key列表,tileID从小到大排列(优先),depth从小到大排列;
//ranges:每一项存储对应tile的的id范围[0,L-1],这个id表示的是在point_list_keys中的索引,通过binningState.point_list找到对应高斯球编号
//例如:point_list_keys值如下:tileID:0 0 0 0 1 1 1 2 2 3 4 4...
// depth: 1 2 3 4 1 4 5 3 4 2 3 5... ,那么point_list_keys[0]中的tileID即为0,ranges[0].x = 0
__global__ void identifyTileRanges(int L, uint64_t* point_list_keys, uint2* ranges)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= L)
return;
// Read tile ID from key. Update start/end of tile range if at limit.
uint64_t key = point_list_keys[idx];
uint32_t currtile = key >> 32; //key右移32位,得到tileID
if (idx == 0) //第0个point_list_keys,也即tileID最小且depth最小的point_list_keys项
ranges[currtile].x = 0; //tileID最小的ranges项的x值赋值为0
else //如果idx=4,那么currtile=1,prevtile=0,就设置ranges[0].y=4,ranges[1].x=4,
{
uint32_t prevtile = point_list_keys[idx - 1] >> 32; //前一个tileID
if (currtile != prevtile) //与前一个tileID不相等的话,设置前一个ranges.y和自己的ranges.x
{
ranges[prevtile].y = idx;
ranges[currtile].x = idx;
}
}
if (idx == L - 1) //最后一个point_list_keys项的话,设置ranges[currtile].y为L
ranges[currtile].y = L;
}
// Mark Gaussians as visible/invisible, based on view frustum testing
void CudaRasterizer::Rasterizer::markVisible(
int P,
float* means3D,
float* viewmatrix,
float* projmatrix,
bool* present)
{
checkFrustum << <(P + 255) / 256, 256 >> > (
P,
means3D,
viewmatrix, projmatrix,
present);
}
//给GeometryState geom分配内存,P:高斯球数量
CudaRasterizer::GeometryState CudaRasterizer::GeometryState::fromChunk(char*& chunk, size_t P)
{
GeometryState geom;
obtain(chunk, geom.depths, P, 128);
obtain(chunk, geom.clamped, P * 3, 128);
obtain(chunk, geom.internal_radii, P, 128);
obtain(chunk, geom.means2D, P, 128);
obtain(chunk, geom.cov3D, P * 6, 128);
obtain(chunk, geom.conic_opacity, P, 128);
obtain(chunk, geom.rgb, P * 3, 128);
obtain(chunk, geom.tiles_touched, P, 128);
//计算数组的前缀和,InclusiveSum表示包括自身,ExclusiveSum表示不包括自身
//参数:第三个in,第四个out,最后一个num。当第一个参数为NULL时, 所需的分配大小被写入第二个参数,并且不执行任何工作
//https://github.com/dmlc/cub/blob/master/cub/device/device_scan.cuh
cub::DeviceScan::InclusiveSum(nullptr, geom.scan_size, geom.tiles_touched, geom.tiles_touched, P); //计算所需内存大小
obtain(chunk, geom.scanning_space, geom.scan_size, 128);
obtain(chunk, geom.point_offsets, P, 128);
return geom;
}
//给ImageState img分配内存,N:图片中tile的总数
CudaRasterizer::ImageState CudaRasterizer::ImageState::fromChunk(char*& chunk, size_t N)
{
ImageState img;
obtain(chunk, img.accum_alpha, N, 128);
obtain(chunk, img.n_contrib, N, 128);
obtain(chunk, img.ranges, N, 128);
return img;
}
//给BinningState binning分配内存,表示装箱状态,P:高斯球数量
CudaRasterizer::BinningState CudaRasterizer::BinningState::fromChunk(char*& chunk, size_t P)
{
BinningState binning;
obtain(chunk, binning.point_list, P, 128);
obtain(chunk, binning.point_list_unsorted, P, 128);
obtain(chunk, binning.point_list_keys, P, 128);
obtain(chunk, binning.point_list_keys_unsorted, P, 128);
cub::DeviceRadixSort::SortPairs( //基数排序,
nullptr, binning.sorting_size,
binning.point_list_keys_unsorted, binning.point_list_keys,
binning.point_list_unsorted, binning.point_list, P);
obtain(chunk, binning.list_sorting_space, binning.sorting_size, 128);
return binning;
}
// Forward rendering procedure for differentiable rasterization
// of Gaussians.
int CudaRasterizer::Rasterizer::forward(
std::function<char* (size_t)> geometryBuffer,
std::function<char* (size_t)> binningBuffer,
std::function<char* (size_t)> imageBuffer,
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii,
bool debug)
{ //计算焦距fx,fy
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
//模版函数required调用fromChunk函数来获取内存,返回结束地址,也即所需存储大小
size_t chunk_size = required<GeometryState>(P);
//调用rasterize_points.cu文件中的resizeFunctional函数里面嵌套的匿名函数lambda来调整显存块大小,并返回首地址
char* chunkptr = geometryBuffer(chunk_size);
// 用显存块首地址作为参数,调用本文件上的fromChunk函数来申请显存
GeometryState geomState = GeometryState::fromChunk(chunkptr, P);
if (radii == nullptr)
{
radii = geomState.internal_radii;
}
//网格中block块的数量,确保覆盖全图
dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
//块中线程数量,16*16
dim3 block(BLOCK_X, BLOCK_Y, 1);
// Dynamically resize image-based auxiliary buffers during training
//模版函数required调用fromChunk函数来获取内存,返回结束地址,也即所需存储大小
size_t img_chunk_size = required<ImageState>(width * height);
char* img_chunkptr = imageBuffer(img_chunk_size);
ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height);
if (NUM_CHANNELS != 3 && colors_precomp == nullptr)
{
throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
}
// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
CHECK_CUDA(FORWARD::preprocess(
P, D, M,
means3D,
(glm::vec3*)scales,
scale_modifier,
(glm::vec4*)rotations,
opacities,
shs,
geomState.clamped,
cov3D_precomp,
colors_precomp,
viewmatrix, projmatrix,
(glm::vec3*)cam_pos,
width, height,
focal_x, focal_y,
tan_fovx, tan_fovy,
radii,
geomState.means2D,
geomState.depths,
geomState.cov3D,
geomState.rgb,
geomState.conic_opacity,
tile_grid,
geomState.tiles_touched,
prefiltered
), debug)
// Compute prefix sum over full list of touched tile counts by Gaussians
// E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
//同步运行InclusiveSum,获取tiles_touched数组的前缀和,存到point_offsets中
CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug)
// Retrieve total number of Gaussian instances to launch and resize aux buffers
int num_rendered;
// 把指针(point_offsets + P - 1),也就是point_offsets数组的最后一个元素的值,赋给num_rendered,也就是总共覆盖的tiles数量
CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug);
//计算所需的BinningState的数量,即每个高斯球覆盖的tile都有对应的装箱状态BinningState数据
size_t binning_chunk_size = required<BinningState>(num_rendered);
//调整显存块大小,并返回首地址
char* binning_chunkptr = binningBuffer(binning_chunk_size);
//用显存块首地址作为参数,调用fromChunk函数来申请显存
BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered);
// For each instance to be rendered, produce adequate [ tile | depth ] key
// and corresponding dublicated Gaussian indices to be sorted
duplicateWithKeys << <(P + 255) / 256, 256 >> > (
P,
geomState.means2D,
geomState.depths,
geomState.point_offsets, // 这里用到上步InclusiveSum得到的累计高斯球touch的tiles数
binningState.point_list_keys_unsorted, // 存储key(tileID|depth)
binningState.point_list_unsorted, // 存储对应的高斯球idx
radii, // 像素平面上高斯圆的半径,最长轴的3倍
tile_grid) // 全图中tile的数量
CHECK_CUDA(, debug) // 同步,并检查错误
//查找tile_grid.x * tile_grid.y的最高位
int bit = getHigherMsb(tile_grid.x * tile_grid.y);
// Sort complete list of (duplicated) Gaussian indices by keys//依据key进行基数排序,
//https://nvlabs.github.io/cub/structcub_1_1_device_radix_sort.html
CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
binningState.list_sorting_space, //辅助空间
binningState.sorting_size, //辅助空间大小
binningState.point_list_keys_unsorted, binningState.point_list_keys, //d_keys_in,d_keys_out
binningState.point_list_unsorted, binningState.point_list, //d_values_in, d_values_out
num_rendered, 0, 32 + bit), debug) //总共覆盖的tiles数量,开始bit位,结束比特位
//imgState.ranges的值设置成0,长度为tile_grid.x * tile_grid.y * sizeof(uint2)
CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug);
// Identify start and end of per-tile workloads in sorted list
if (num_rendered > 0)
identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
num_rendered,
binningState.point_list_keys,
imgState.ranges);
CHECK_CUDA(, debug)
// Let each tile blend its range of Gaussians independently in parallel
const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb;
CHECK_CUDA(FORWARD::render(
tile_grid, block,
imgState.ranges,
binningState.point_list,
width, height,
geomState.means2D,
feature_ptr,
geomState.conic_opacity,
imgState.accum_alpha,
imgState.n_contrib,
background,
out_color), debug)
return num_rendered;
}
forward.cu:
// 2D协方差矩阵计算的前向版本,mean:高斯点在世界坐标系的坐标,focla_x,focal_y:相机焦距,tan_fovx,tan_fovy:相机视场角的tan值,cov3D:3d椭球的协方差矩阵,viewmatrix:视角变换矩阵
// 计算结果存在cov中,并返回
__device__ float3 computeCov2D(const float3& mean, float focal_x, float focal_y, float tan_fovx, float tan_fovy, const float* cov3D, const float* viewmatrix)
{
// The following models the steps outlined by equations 29
// and 31 in "EWA Splatting" (Zwicker et al., 2002).
// Additionally considers aspect / scaling of viewport.
// Transposes used to account for row-/column-major conventions.
float3 t = transformPoint4x3(mean, viewmatrix);
const float limx = 1.3f * tan_fovx;
const float limy = 1.3f * tan_fovy;
const float txtz = t.x / t.z;
const float tytz = t.y / t.z;
t.x = min(limx, max(-limx, txtz)) * t.z;
t.y = min(limy, max(-limy, tytz)) * t.z;
glm::mat3 J = glm::mat3(
focal_x / t.z, 0.0f, -(focal_x * t.x) / (t.z * t.z),
0.0f, focal_y / t.z, -(focal_y * t.y) / (t.z * t.z),
0, 0, 0);
glm::mat3 W = glm::mat3(
viewmatrix[0], viewmatrix[4], viewmatrix[8],
viewmatrix[1], viewmatrix[5], viewmatrix[9],
viewmatrix[2], viewmatrix[6], viewmatrix[10]);
glm::mat3 T = W * J;
//Vrk:3d协方差矩阵
glm::mat3 Vrk = glm::mat3(
cov3D[0], cov3D[1], cov3D[2],
cov3D[1], cov3D[3], cov3D[4],
cov3D[2], cov3D[4], cov3D[5]);
//cov: (W^T*J)^T * Vrk^T * (W^T*J) == (J*W)* Vrk^T * (W^T*J^T)
glm::mat3 cov = glm::transpose(T) * glm::transpose(Vrk) * T;
// Apply low-pass filter: every Gaussian should be at least
// one pixel wide/high. Discard 3rd row and column.//确保协方差矩阵正定,数值稳定性考虑
cov[0][0] += 0.3f;
cov[1][1] += 0.3f;
return { float(cov[0][0]), float(cov[0][1]), float(cov[1][1]) };
}
// 在世界坐标系中,将高斯的旋转和缩放属性转换为3d协方差矩阵,同时注意四元数归一化。
// scale:缩放,mod:缩放的模长,rot:四元数旋转
// cov3D:返回的协方差矩阵,RS*(RS)T
__device__ void computeCov3D(const glm::vec3 scale, float mod, const glm::vec4 rot, float* cov3D)
{
// Create scaling matrix
glm::mat3 S = glm::mat3(1.0f);
S[0][0] = mod * scale.x;
S[1][1] = mod * scale.y;
S[2][2] = mod * scale.z;
// Normalize quaternion to get valid rotation//从四元数计算旋转矩阵
glm::vec4 q = rot;// / glm::length(rot);
float r = q.x;
float x = q.y;
float y = q.z;
float z = q.w;
// Compute rotation matrix from quaternion
glm::mat3 R = glm::mat3(
1.f - 2.f * (y * y + z * z), 2.f * (x * y - r * z), 2.f * (x * z + r * y),
2.f * (x * y + r * z), 1.f - 2.f * (x * x + z * z), 2.f * (y * z - r * x),
2.f * (x * z - r * y), 2.f * (y * z + r * x), 1.f - 2.f * (x * x + y * y)
);
glm::mat3 M = S * R;
// Compute 3D world covariance matrix Sigma
glm::mat3 Sigma = glm::transpose(M) * M;
// Covariance is symmetric, only store upper right
cov3D[0] = Sigma[0][0];
cov3D[1] = Sigma[0][1];
cov3D[2] = Sigma[0][2];
cov3D[3] = Sigma[1][1];
cov3D[4] = Sigma[1][2];
cov3D[5] = Sigma[2][2];
}
// 计算每一个高斯球投影出来的圆半径及覆盖的范围,一个线程处理一个高斯球
template<int C>
__global__ void preprocessCUDA(int P, int D, int M, // P:高斯球的数量,D:,M:
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
int* radii,
float2* points_xy_image,
float* depths,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid, //block块的范围
uint32_t* tiles_touched, //
bool prefiltered)
{
auto idx = cg::this_grid().thread_rank(); // 线程在组内的标号,区间为[0,num_threads)
if (idx >= P) // 如果编号大于高斯球的数量,则返回,也就是说一个线程对应一个高斯球处理
return;
// Initialize radius and touched tiles to 0. If this isn't changed,
// this Gaussian will not be processed further.//半径和tile数
radii[idx] = 0;
tiles_touched[idx] = 0;
// Perform near culling, quit if outside执行近剔除,如果超出范围则退出.idx:0-255,thread的id号;
float3 p_view;
if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
return;
// Transform point by projecting //p_orig:空间3d点,projmatrix:投影矩阵c2w
float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
//投影到相机ndc坐标系后的高斯球中心坐标
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
// If 3D covariance matrix is precomputed, use it, otherwise compute from scaling and rotation parameters.
const float* cov3D;
if (cov3D_precomp != nullptr)
{
cov3D = cov3D_precomp + idx * 6; //第idx个高斯球的3d协方差
}
else
{ //用缩放和旋转计算三个轴长度(3d协方差矩阵)
computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
cov3D = cov3Ds + idx * 6;
}
// Compute 2D screen-space covariance matrix//计算椭球投影成椭圆的样子,用对称2D矩阵记录(abbc)
float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
// Invert covariance (EWA algorithm)//数值求2*2方阵[cov.x,cov.y,cov.y,cov.z]abbc的特征值
float det = (cov.x * cov.z - cov.y * cov.y); //2*2矩阵的模,ac - b^2
if (det == 0.0f)
return;
float det_inv = 1.f / det;
float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv }; //2d协方差矩阵的逆 det*(c,-b,a)
// Compute extent in screen space (by finding eigenvalues of
// 2D covariance matrix). Use extent to compute a bounding rectangle
// of screen-space tiles that this Gaussian overlaps with. Quit if
// rectangle covers 0 tiles.
float mid = 0.5f * (cov.x + cov.z);
float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2))); //最长轴的3倍,覆盖99%的区域
float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) }; //高斯球中心的像素坐标
uint2 rect_min, rect_max;
//计算圆覆盖了哪些tile,返回rect_min, rect_max
getRect(point_image, my_radius, rect_min, rect_max, grid);
if ((rect_max.x - rect_min.x) * (rect_max.y - rect_min.y) == 0)
return;
// If colors have been precomputed, use them, otherwise convert
// spherical harmonics coefficients to RGB color.//从球谐函数系数求投影点颜色
if (colors_precomp == nullptr)
{
glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, shs, clamped);
rgb[idx * C + 0] = result.x; //C为模版参数,表示通道数,这里是3
rgb[idx * C + 1] = result.y;
rgb[idx * C + 2] = result.z;
}
// Store some useful helper data for the next steps.
depths[idx] = p_view.z; //第idx个高斯球的中心点的深度值
radii[idx] = my_radius; //高斯椭圆的长轴半径,超过这个范围高斯分布的值很小
points_xy_image[idx] = point_image; //第idx个高斯椭圆的像素坐标
// Inverse 2D covariance and opacity neatly pack into one float4
conic_opacity[idx] = { conic.x, conic.y, conic.z, opacities[idx] }; //2d协方差矩阵的逆和不透明度
tiles_touched[idx] = (rect_max.y - rect_min.y) * (rect_max.x - rect_min.x); //存储所覆盖的tile的数量
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.//渲染,一个block一个tile,一个thread一个像素
template <uint32_t CHANNELS>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H,
const float2* __restrict__ points_xy_image,
const float* __restrict__ features,
const float4* __restrict__ conic_opacity,
float* __restrict__ final_T,
uint32_t* __restrict__ n_contrib,
const float* __restrict__ bg_color,
float* __restrict__ out_color)
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block(); //创建线程块block
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X; //水平块数
//group_index():当前block在grid中的三维索引,相当于blockIdx
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y }; //最小像素坐标
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) }; //最大像素坐标
//thread_index():当前thread在block中的三维索引,相当于threadIdx
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y }; //当前线程正在处理的像素坐标
uint32_t pix_id = W * pix.y + pix.x; //像素id,二维展平成一维后
float2 pixf = { (float)pix.x, (float)pix.y }; //浮点型像素坐标
// Check if this thread is associated with a valid pixel or outside.//在图像内的像素点
bool inside = pix.x < W&& pix.y < H;
// Done threads can help with fetching, but don't rasterize //渲染结束标志,初始值取inside的相反值
bool done = !inside;
// Load start/end range of IDs to process in bit sorted list.
// 当前tile在point_list_keys中的范围,包括x下界和y上届
uint2 range = ranges[block.group_index().y * horizontal_blocks + block.group_index().x];
// rounds表示每个线程需要处理的高斯球的数量
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
// toDo=range.y-range.x表示当前tile(block)对应的需要计算的高斯球的数量
int toDo = range.y - range.x;
// Allocate storage for batches of collectively fetched data.
//共享显存,被同一个block的thread共享的存储
__shared__ int collected_id[BLOCK_SIZE]; // 记录各线程处理的高斯球的编号
__shared__ float2 collected_xy[BLOCK_SIZE]; // 记录各线程处理的高斯球中心在2d平面的投影坐标
__shared__ float4 collected_conic_opacity[BLOCK_SIZE]; // 记录各线程处理的高斯球的2d协方差矩阵的逆和不透明度
// Initialize helper variables
float T = 1.0f; // 透射率
uint32_t contributor = 0; // 计算经过了多少高斯
uint32_t last_contributor = 0; // 存储最终经过的高斯球数量
float C[CHANNELS] = { 0 }; // 最后渲染的颜色
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// 如果一个block里面全部thread完成,则退出循环
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// 共同从全局显存读取每一个高斯数据到共享显存,已经结束的thread去取
int progress = i * BLOCK_SIZE + block.thread_rank(); //thread_rank:当前线程在组内的标号,区间为[0, num_threads)
if (range.x + progress < range.y) //如果当前线程有效,即处理的高斯球不越界
{
//point_list表示与已排序的point_list_keys对应的高斯球编号,coll_id为当前线程处理的高斯球编号
int coll_id = point_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id; //当前线程处理的高斯球id
collected_xy[block.thread_rank()] = points_xy_image[coll_id]; //points_xy_image:高斯椭圆中心在像素平面的坐标
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id]; //2d协方差矩阵的逆和不透明度
}
block.sync();
// 每个线程遍历当前batch一堆高斯球
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)//圆锥形矩阵
float2 xy = collected_xy[j]; //2d高斯椭圆中心像素坐标
float2 d = { xy.x - pixf.x, xy.y - pixf.y }; //像素点与2d高斯中心点的距离
float4 con_o = collected_conic_opacity[j]; //2d协方差矩阵的逆和不透明度
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y; // 计算高斯分布概率的指数部分
if (power > 0.0f)
continue;
// Eq. (2) from 3D Gaussian splatting paper.// 通过与高斯不透明度相乘获得alpha值及其从平均值开始的指数衰减。避免数值不稳定性
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power)); //通过power值和不透明度计算alpha值
if (alpha < 1.0f / 255.0f)
continue;
float test_T = T * (1 - alpha); //由alpha计算透射率
if (test_T < 0.0001f)
{
done = true;
continue;
}
// Eq. (3) from 3D Gaussian splatting paper.//颜色alpha合成
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
T = test_T;
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
}
}
// 所有处理有效像素的thread都会将其最终渲染数据
// 写入帧缓冲区和辅助缓冲区
if (inside)
{
final_T[pix_id] = T; // 输出最终pix_id像素点的T值
n_contrib[pix_id] = last_contributor; // 输出贡献的高斯球数
for (int ch = 0; ch < CHANNELS; ch++)
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch]; //
}
}更多推荐



所有评论(0)