Optimize vertical box filter to iterate over rows for cache friendliness.
This commit is contained in:
parent
6966433c9c
commit
bb1b5ddac0
10
Cargo.lock
generated
10
Cargo.lock
generated
@ -190,6 +190,15 @@ dependencies = [
|
||||
"tiff",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319"
|
||||
dependencies = [
|
||||
"either",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jpeg-decoder"
|
||||
version = "0.1.22"
|
||||
@ -315,6 +324,7 @@ name = "physarum"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"image",
|
||||
"itertools",
|
||||
"rand",
|
||||
"rayon",
|
||||
]
|
||||
|
||||
@ -8,5 +8,6 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
image = "*"
|
||||
itertools = "*"
|
||||
rand = "*"
|
||||
rayon = "*"
|
||||
|
||||
215
src/blur.rs
215
src/blur.rs
@ -1,102 +1,116 @@
|
||||
/// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each
|
||||
/// element in the output array contains the radius of the box filter for the corresponding pass.
|
||||
pub fn boxes_for_gaussian<const N: usize>(sigma: f32) -> ([usize; N]) {
|
||||
let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt();
|
||||
let mut w = w_ideal as usize;
|
||||
w -= 1 - w & 1;
|
||||
use itertools::multizip;
|
||||
|
||||
let mut m = ((w * w + 4 * w + 3) * N) as f32;
|
||||
m -= 12.0 * sigma * sigma;
|
||||
m *= 0.25;
|
||||
m /= (w + 1) as f32;
|
||||
let m = m.round() as usize;
|
||||
|
||||
let mut result = [0; N];
|
||||
for (i, value) in result.iter_mut().enumerate() {
|
||||
*value = (if i < m { w - 1 } else { w + 1 }) / 2;
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Blur an image with 3 box filter passes. The result will be written to the src slice, while the
|
||||
/// buf slice is used as a scratch space.
|
||||
pub fn approximate_gauss_blur(
|
||||
src: &mut [f32],
|
||||
buf: &mut [f32],
|
||||
#[derive(Debug)]
|
||||
pub struct Blur {
|
||||
width: usize,
|
||||
height: usize,
|
||||
sigma: f32,
|
||||
decay: f32,
|
||||
) {
|
||||
let boxes = boxes_for_gaussian::<3>(sigma);
|
||||
box_blur(src, buf, width, height, boxes[0], 1.0);
|
||||
box_blur(src, buf, width, height, boxes[1], 1.0);
|
||||
box_blur(src, buf, width, height, boxes[2], decay);
|
||||
row_buffer: Vec<f32>,
|
||||
}
|
||||
|
||||
/// Perform one pass of the 2D box filter of the given radius. The result will be written to the src
|
||||
/// slice, while the buf slice is used as a scratch space.
|
||||
fn box_blur(
|
||||
src: &mut [f32],
|
||||
buf: &mut [f32],
|
||||
width: usize,
|
||||
height: usize,
|
||||
radius: usize,
|
||||
decay: f32,
|
||||
) {
|
||||
box_blur_h(src, buf, width, radius);
|
||||
box_blur_v(buf, src, width, height, radius, decay);
|
||||
}
|
||||
|
||||
/// Perform one pass of the 1D box filter of the given radius along x axis.
|
||||
fn box_blur_h(src: &[f32], dst: &mut [f32], width: usize, radius: usize) {
|
||||
let weight = 1.0 / (2 * radius + 1) as f32;
|
||||
|
||||
// TODO: Parallelize with rayon
|
||||
for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) {
|
||||
// First we build a value for the beginning of each row. We assume periodic boundary
|
||||
// conditions, so we need to push the left index to the opposite side of the row.
|
||||
let mut value = src_row[width - radius - 1];
|
||||
for j in 0..radius {
|
||||
value += src_row[width - radius + j] + src_row[j];
|
||||
}
|
||||
// At this point "value" contains the unweighted sum for the right-most row element.
|
||||
for current_id in 0..width {
|
||||
let left_id = (current_id + width - radius - 1) & (width - 1);
|
||||
let right_id = (current_id + radius) & (width - 1);
|
||||
value += src_row[right_id] - src_row[left_id];
|
||||
dst_row[current_id] = value * weight;
|
||||
impl Blur {
|
||||
pub fn new(width: usize, height: usize) -> Self {
|
||||
Blur {
|
||||
width,
|
||||
height,
|
||||
row_buffer: vec![0.0; width],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay factor
|
||||
/// to the destination buffer.
|
||||
fn box_blur_v(
|
||||
src: &[f32],
|
||||
dst: &mut [f32],
|
||||
width: usize,
|
||||
height: usize,
|
||||
radius: usize,
|
||||
decay: f32,
|
||||
) {
|
||||
let weight = decay / (2 * radius + 1) as f32;
|
||||
/// Blur an image with 3 box filter passes. The result will be written to the src slice, while
|
||||
/// the buf slice is used as a scratch space.
|
||||
pub fn run(&mut self, src: &mut [f32], buf: &mut [f32], sigma: f32, decay: f32) {
|
||||
let boxes = Blur::boxes_for_gaussian::<3>(sigma);
|
||||
self.box_blur(src, buf, boxes[0], 1.0);
|
||||
self.box_blur(src, buf, boxes[1], 1.0);
|
||||
self.box_blur(src, buf, boxes[2], decay);
|
||||
}
|
||||
|
||||
// TODO: Parallelize with rayon
|
||||
for i in 0..width {
|
||||
// First we build a value for the beginning of each column. We assume periodic boundary
|
||||
// conditions, so we need to push the bottom index to the opposite side of the column.
|
||||
let mut value = src[i + (height - radius - 1) * width];
|
||||
for j in 0..radius {
|
||||
value += src[i + (height - radius + j) * width] + src[i + j * width];
|
||||
/// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each
|
||||
/// element in the output array contains the radius of the box filter for the corresponding
|
||||
/// pass.
|
||||
fn boxes_for_gaussian<const N: usize>(sigma: f32) -> ([usize; N]) {
|
||||
let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt();
|
||||
let mut w = w_ideal as usize;
|
||||
w -= 1 - w & 1;
|
||||
|
||||
let mut m = 0.25 * (N * (w + 3)) as f32;
|
||||
m -= 3.0 * sigma * sigma / (w + 1) as f32;
|
||||
let m = m.round() as usize;
|
||||
|
||||
let mut result = [0; N];
|
||||
for (i, value) in result.iter_mut().enumerate() {
|
||||
*value = (if i < m { w - 1 } else { w + 1 }) / 2;
|
||||
}
|
||||
// At this point "value" contains the unweighted sum for the top-most column element.
|
||||
result
|
||||
}
|
||||
|
||||
for current_id in (i..i + height * width).step_by(width) {
|
||||
let bottom_id = (current_id + (height - radius - 1) * width) & (width * height - 1);
|
||||
let top_id = (current_id + radius * width) & (width * height - 1);
|
||||
value += src[top_id] - src[bottom_id];
|
||||
dst[current_id] = value * weight;
|
||||
/// Perform one pass of the 2D box filter of the given radius. The result will be written to the
|
||||
/// src slice, while the buf slice is used as a scratch space.
|
||||
fn box_blur(&mut self, src: &mut [f32], buf: &mut [f32], radius: usize, decay: f32) {
|
||||
self.box_blur_h(src, buf, radius);
|
||||
self.box_blur_v(buf, src, radius, decay);
|
||||
}
|
||||
|
||||
/// Perform one pass of the 1D box filter of the given radius along x axis.
|
||||
fn box_blur_h(&mut self, src: &[f32], dst: &mut [f32], radius: usize) {
|
||||
let weight = 1.0 / (2 * radius + 1) as f32;
|
||||
let width = self.width;
|
||||
|
||||
for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) {
|
||||
// First we build a value for the beginning of each row. We assume periodic boundary
|
||||
// conditions, so we need to push the left index to the opposite side of the row.
|
||||
let mut value = src_row[width - radius - 1];
|
||||
for j in 0..radius {
|
||||
value += src_row[width - radius + j] + src_row[j];
|
||||
}
|
||||
// At this point "value" contains the unweighted sum for the right-most row element.
|
||||
for i in 0..width {
|
||||
let left = (i + width - radius - 1) & (width - 1);
|
||||
let right = (i + radius) & (width - 1);
|
||||
value += src_row[right] - src_row[left];
|
||||
dst_row[i] = value * weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay
|
||||
/// factor to the destination buffer.
|
||||
fn box_blur_v(&mut self, src: &[f32], dst: &mut [f32], radius: usize, decay: f32) {
|
||||
let weight = decay / (2 * radius + 1) as f32;
|
||||
let (width, height) = (self.width, self.height);
|
||||
|
||||
// We don't replicate the horizontal filter logic because of the cache-unfriendly memory
|
||||
// access patterns of sequential iteration over individual columns. Instead, we iterate over
|
||||
// rows via loop interchange.
|
||||
|
||||
let offset = (height - radius - 1) * width;
|
||||
self.row_buffer
|
||||
.copy_from_slice(&src[offset..offset + width]);
|
||||
|
||||
for j in 0..radius {
|
||||
let bottom_off = (height - radius + j) * width;
|
||||
let bottom_row = &src[bottom_off..bottom_off + width];
|
||||
let top_off = j * width;
|
||||
let top_row = &src[top_off..top_off + width];
|
||||
|
||||
for (buf, bottom, top) in multizip((&mut self.row_buffer, bottom_row, top_row)) {
|
||||
*buf += bottom + top;
|
||||
}
|
||||
}
|
||||
// At this point row_buffer contains the unweighted sum for all top elements.
|
||||
|
||||
for (i, dst_row) in dst.chunks_exact_mut(width).enumerate() {
|
||||
let bottom_off = ((i + height - radius - 1) & (height - 1)) * width;
|
||||
let bottom_row = &src[bottom_off..bottom_off + width];
|
||||
let top_off = ((i + radius) & (height - 1)) * width;
|
||||
let top_row = &src[top_off..top_off + width];
|
||||
|
||||
for (dst, buf, bottom, top) in
|
||||
multizip((dst_row, &mut self.row_buffer, bottom_row, top_row))
|
||||
{
|
||||
*buf += top - bottom;
|
||||
*dst = *buf * weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -107,9 +121,26 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_blur() {
|
||||
let src = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
|
||||
let mut dst = vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0];
|
||||
box_blur_h(&src, &mut dst, 4, 1);
|
||||
println!("Out: {:?}", dst);
|
||||
let src: Vec<f32> = (1..17).map(|v| v as f32).collect();
|
||||
let mut dst = vec![0.0; 16];
|
||||
let mut blur = Blur::new(4, 4);
|
||||
|
||||
blur.box_blur_h(&src, &mut dst, 1);
|
||||
assert_eq!(
|
||||
dst,
|
||||
&[
|
||||
2.3333335, 2.0, 3.0, 2.6666667, 6.3333335, 6.0, 7.0, 6.666667, 10.333334, 10.0,
|
||||
11.0, 10.666667, 14.333334, 14.0, 15.0, 14.666667
|
||||
]
|
||||
);
|
||||
|
||||
blur.box_blur_v(&src, &mut dst, 1, 1.0);
|
||||
assert_eq!(
|
||||
dst,
|
||||
[
|
||||
6.3333335, 7.3333335, 8.333334, 9.333334, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0,
|
||||
12.0, 7.666667, 8.666667, 9.666667, 10.666667
|
||||
]
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
17
src/grid.rs
17
src/grid.rs
@ -1,13 +1,17 @@
|
||||
use crate::blur::approximate_gauss_blur;
|
||||
use rand::{distributions::Uniform, Rng};
|
||||
|
||||
use crate::blur::Blur;
|
||||
|
||||
/// A 2D grid with a scalar value per each grid block.
|
||||
#[derive(Debug)]
|
||||
pub struct Grid {
|
||||
width: usize,
|
||||
height: usize,
|
||||
data: Vec<f32>,
|
||||
|
||||
// The scratch space for the blur operation.
|
||||
buf: Vec<f32>,
|
||||
blur: Blur,
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@ -30,6 +34,7 @@ impl Grid {
|
||||
height,
|
||||
data,
|
||||
buf: vec![0.0; width * height],
|
||||
blur: Blur::new(width, height),
|
||||
}
|
||||
}
|
||||
|
||||
@ -60,14 +65,8 @@ impl Grid {
|
||||
|
||||
/// Diffuse grid data and apply a decay multiplier.
|
||||
pub fn diffuse(&mut self, radius: usize, decay_factor: f32) {
|
||||
approximate_gauss_blur(
|
||||
&mut self.data,
|
||||
&mut self.buf,
|
||||
self.width,
|
||||
self.height,
|
||||
radius as f32,
|
||||
decay_factor,
|
||||
);
|
||||
self.blur
|
||||
.run(&mut self.data, &mut self.buf, radius as f32, decay_factor);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -2,7 +2,4 @@ mod blur;
|
||||
mod grid;
|
||||
mod model;
|
||||
|
||||
fn main() {
|
||||
let boxes = blur::boxes_for_gaussian::<3>(2.5);
|
||||
println!("boxes: {:?}", boxes);
|
||||
}
|
||||
fn main() {}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user