Optimize vertical box filter to iterate over rows for cache friendliness.

This commit is contained in:
mindv0rtex 2021-02-25 15:17:06 -05:00
parent 6966433c9c
commit bb1b5ddac0
5 changed files with 143 additions and 105 deletions

10
Cargo.lock generated
View File

@ -190,6 +190,15 @@ dependencies = [
"tiff",
]
[[package]]
name = "itertools"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
dependencies = [
"either",
]
[[package]]
name = "jpeg-decoder"
version = "0.1.22"
@ -315,6 +324,7 @@ name = "physarum"
version = "0.1.0"
dependencies = [
"image",
"itertools",
"rand",
"rayon",
]

View File

@ -8,5 +8,6 @@ edition = "2018"
[dependencies]
image = "*"
itertools = "*"
rand = "*"
rayon = "*"

View File

@ -1,102 +1,116 @@
/// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each
/// element in the output array contains the radius of the box filter for the corresponding pass.
pub fn boxes_for_gaussian<const N: usize>(sigma: f32) -> ([usize; N]) {
let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt();
let mut w = w_ideal as usize;
w -= 1 - w & 1;
use itertools::multizip;
let mut m = ((w * w + 4 * w + 3) * N) as f32;
m -= 12.0 * sigma * sigma;
m *= 0.25;
m /= (w + 1) as f32;
let m = m.round() as usize;
let mut result = [0; N];
for (i, value) in result.iter_mut().enumerate() {
*value = (if i < m { w - 1 } else { w + 1 }) / 2;
}
result
}
/// Blur an image with 3 box filter passes. The result will be written to the src slice, while the
/// buf slice is used as a scratch space.
pub fn approximate_gauss_blur(
src: &mut [f32],
buf: &mut [f32],
#[derive(Debug)]
pub struct Blur {
width: usize,
height: usize,
sigma: f32,
decay: f32,
) {
let boxes = boxes_for_gaussian::<3>(sigma);
box_blur(src, buf, width, height, boxes[0], 1.0);
box_blur(src, buf, width, height, boxes[1], 1.0);
box_blur(src, buf, width, height, boxes[2], decay);
row_buffer: Vec<f32>,
}
/// Perform one pass of the 2D box filter of the given radius. The result will be written to the src
/// slice, while the buf slice is used as a scratch space.
fn box_blur(
src: &mut [f32],
buf: &mut [f32],
width: usize,
height: usize,
radius: usize,
decay: f32,
) {
box_blur_h(src, buf, width, radius);
box_blur_v(buf, src, width, height, radius, decay);
}
/// Perform one pass of the 1D box filter of the given radius along x axis.
fn box_blur_h(src: &[f32], dst: &mut [f32], width: usize, radius: usize) {
let weight = 1.0 / (2 * radius + 1) as f32;
// TODO: Parallelize with rayon
for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) {
// First we build a value for the beginning of each row. We assume periodic boundary
// conditions, so we need to push the left index to the opposite side of the row.
let mut value = src_row[width - radius - 1];
for j in 0..radius {
value += src_row[width - radius + j] + src_row[j];
}
// At this point "value" contains the unweighted sum for the right-most row element.
for current_id in 0..width {
let left_id = (current_id + width - radius - 1) & (width - 1);
let right_id = (current_id + radius) & (width - 1);
value += src_row[right_id] - src_row[left_id];
dst_row[current_id] = value * weight;
impl Blur {
pub fn new(width: usize, height: usize) -> Self {
Blur {
width,
height,
row_buffer: vec![0.0; width],
}
}
}
/// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay factor
/// to the destination buffer.
fn box_blur_v(
src: &[f32],
dst: &mut [f32],
width: usize,
height: usize,
radius: usize,
decay: f32,
) {
let weight = decay / (2 * radius + 1) as f32;
/// Blur an image with 3 box filter passes. The result will be written to the src slice, while
/// the buf slice is used as a scratch space.
pub fn run(&mut self, src: &mut [f32], buf: &mut [f32], sigma: f32, decay: f32) {
let boxes = Blur::boxes_for_gaussian::<3>(sigma);
self.box_blur(src, buf, boxes[0], 1.0);
self.box_blur(src, buf, boxes[1], 1.0);
self.box_blur(src, buf, boxes[2], decay);
}
// TODO: Parallelize with rayon
for i in 0..width {
// First we build a value for the beginning of each column. We assume periodic boundary
// conditions, so we need to push the bottom index to the opposite side of the column.
let mut value = src[i + (height - radius - 1) * width];
for j in 0..radius {
value += src[i + (height - radius + j) * width] + src[i + j * width];
/// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each
/// element in the output array contains the radius of the box filter for the corresponding
/// pass.
fn boxes_for_gaussian<const N: usize>(sigma: f32) -> ([usize; N]) {
let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt();
let mut w = w_ideal as usize;
w -= 1 - w & 1;
let mut m = 0.25 * (N * (w + 3)) as f32;
m -= 3.0 * sigma * sigma / (w + 1) as f32;
let m = m.round() as usize;
let mut result = [0; N];
for (i, value) in result.iter_mut().enumerate() {
*value = (if i < m { w - 1 } else { w + 1 }) / 2;
}
// At this point "value" contains the unweighted sum for the top-most column element.
result
}
for current_id in (i..i + height * width).step_by(width) {
let bottom_id = (current_id + (height - radius - 1) * width) & (width * height - 1);
let top_id = (current_id + radius * width) & (width * height - 1);
value += src[top_id] - src[bottom_id];
dst[current_id] = value * weight;
/// Perform one pass of the 2D box filter of the given radius. The result will be written to the
/// src slice, while the buf slice is used as a scratch space.
fn box_blur(&mut self, src: &mut [f32], buf: &mut [f32], radius: usize, decay: f32) {
self.box_blur_h(src, buf, radius);
self.box_blur_v(buf, src, radius, decay);
}
/// Perform one pass of the 1D box filter of the given radius along x axis.
fn box_blur_h(&mut self, src: &[f32], dst: &mut [f32], radius: usize) {
let weight = 1.0 / (2 * radius + 1) as f32;
let width = self.width;
for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) {
// First we build a value for the beginning of each row. We assume periodic boundary
// conditions, so we need to push the left index to the opposite side of the row.
let mut value = src_row[width - radius - 1];
for j in 0..radius {
value += src_row[width - radius + j] + src_row[j];
}
// At this point "value" contains the unweighted sum for the right-most row element.
for i in 0..width {
let left = (i + width - radius - 1) & (width - 1);
let right = (i + radius) & (width - 1);
value += src_row[right] - src_row[left];
dst_row[i] = value * weight;
}
}
}
/// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay
/// factor to the destination buffer.
fn box_blur_v(&mut self, src: &[f32], dst: &mut [f32], radius: usize, decay: f32) {
let weight = decay / (2 * radius + 1) as f32;
let (width, height) = (self.width, self.height);
// We don't replicate the horizontal filter logic because of the cache-unfriendly memory
// access patterns of sequential iteration over individual columns. Instead, we iterate over
// rows via loop interchange.
let offset = (height - radius - 1) * width;
self.row_buffer
.copy_from_slice(&src[offset..offset + width]);
for j in 0..radius {
let bottom_off = (height - radius + j) * width;
let bottom_row = &src[bottom_off..bottom_off + width];
let top_off = j * width;
let top_row = &src[top_off..top_off + width];
for (buf, bottom, top) in multizip((&mut self.row_buffer, bottom_row, top_row)) {
*buf += bottom + top;
}
}
// At this point row_buffer contains the unweighted sum for all top elements.
for (i, dst_row) in dst.chunks_exact_mut(width).enumerate() {
let bottom_off = ((i + height - radius - 1) & (height - 1)) * width;
let bottom_row = &src[bottom_off..bottom_off + width];
let top_off = ((i + radius) & (height - 1)) * width;
let top_row = &src[top_off..top_off + width];
for (dst, buf, bottom, top) in
multizip((dst_row, &mut self.row_buffer, bottom_row, top_row))
{
*buf += top - bottom;
*dst = *buf * weight;
}
}
}
}
@ -107,9 +121,26 @@ mod tests {
#[test]
fn test_blur() {
let src = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let mut dst = vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
box_blur_h(&src, &mut dst, 4, 1);
println!("Out: {:?}", dst);
let src: Vec<f32> = (1..17).map(|v| v as f32).collect();
let mut dst = vec![0.0; 16];
let mut blur = Blur::new(4, 4);
blur.box_blur_h(&src, &mut dst, 1);
assert_eq!(
dst,
&[
2.3333335, 2.0, 3.0, 2.6666667, 6.3333335, 6.0, 7.0, 6.666667, 10.333334, 10.0,
11.0, 10.666667, 14.333334, 14.0, 15.0, 14.666667
]
);
blur.box_blur_v(&src, &mut dst, 1, 1.0);
assert_eq!(
dst,
[
6.3333335, 7.3333335, 8.333334, 9.333334, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0,
12.0, 7.666667, 8.666667, 9.666667, 10.666667
]
)
}
}

View File

@ -1,13 +1,17 @@
use crate::blur::approximate_gauss_blur;
use rand::{distributions::Uniform, Rng};
use crate::blur::Blur;
/// A 2D grid with a scalar value per each grid block.
#[derive(Debug)]
pub struct Grid {
width: usize,
height: usize,
data: Vec<f32>,
// The scratch space for the blur operation.
buf: Vec<f32>,
blur: Blur,
}
#[inline(always)]
@ -30,6 +34,7 @@ impl Grid {
height,
data,
buf: vec![0.0; width * height],
blur: Blur::new(width, height),
}
}
@ -60,14 +65,8 @@ impl Grid {
/// Diffuse grid data and apply a decay multiplier.
pub fn diffuse(&mut self, radius: usize, decay_factor: f32) {
approximate_gauss_blur(
&mut self.data,
&mut self.buf,
self.width,
self.height,
radius as f32,
decay_factor,
);
self.blur
.run(&mut self.data, &mut self.buf, radius as f32, decay_factor);
}
}

View File

@ -2,7 +2,4 @@ mod blur;
mod grid;
mod model;
fn main() {
let boxes = blur::boxes_for_gaussian::<3>(2.5);
println!("boxes: {:?}", boxes);
}
fn main() {}