sin/cos optimizations and cleanup

2021-03-30 12:54:57 -04:00 · 2021-03-30 12:54:57 -04:00 · f76c9f7401
commit f76c9f7401
parent 6e2b5c3d2a
9 changed files with 99 additions and 33 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -248,6 +248,12 @@ version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"

+[[package]]
+name = "fastapprox"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0031c93f37b5d18272de2d932ebff6a7eb32d4bc3bab6751a9af42da7d1a424"
+
 [[package]]
 name = "getrandom"
 version = "0.2.2"
@ -488,6 +494,7 @@ name = "physarum"
 version = "0.1.0"
 dependencies = [
 "criterion",
+ "fastapprox",
 "image",
 "indicatif",
 "itertools 0.10.0",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -12,6 +12,7 @@ rand = "0.8.3"
 rand_distr = "0.4"
 #rayon = "1.5"
 rayon = {git = "https://github.com/rayon-rs/rayon.git"}
+fastapprox = "0.3.0"

 [dev-dependencies]
 criterion = "0.3.4"
--- a/Notes.md
+++ b/Notes.md
@ -0,0 +1,17 @@
+## sin and cos optimizations:
+### Setup/Info:
+- measured in ms/agent ticked
+- 2048 iterations
+- average of 3 trials
+- 256x256 grid
+- 1 << 20 particles
+- 1 population
+### Results:
+- normal sin + normal cos: 
+    - 0.000018192ms
+- old sin + old cos:
+    - 0.000019803ms (8.85% slower)
+- fastapprox::fast::sin + fastapprox::fast::cos:
+    - 0.000018658ms (2.56% slower)
+- fastapprox::faster::sin + fastapprox::faster::cos:
+    - 0.000015878ms (14.57% faster, i.e. 12.72% less time per agent)
--- a/TODO.md
+++ b/TODO.md
@ -2,4 +2,7 @@
 - Automatically create an mp4 from the generated images
    - Instead of using the command `ffmpeg -r 20 -i tmp/out_%d.png -vcodec libx265 -crf 25 -s 512x512 test.mp4` maybe use a rust library to do the same (more research needed)
 - GPU compute 
-    - Tried [ArrayFire-rust](https://github.com/arrayfire/arrayfire-rust) didn't work well, looking for another library
+    - Tried [ArrayFire-rust](https://github.com/arrayfire/arrayfire-rust) didn't work well, looking for another library
+    - Try using [emu](https://github.com/calebwin/emu) (seems to be a very good option)
+- sin and cos optimizations
+    - sin/cos table?
--- a/src/lib.rs
+++ b/src/lib.rs
@ -3,4 +3,5 @@ mod grid;
 pub mod model;
 mod palette;
 mod util;
-mod imgdata; // for storing image data
+mod imgdata; // for storing image data
+mod math; // fast math helpers (abs, floor, approximate sin/cos)
--- a/src/main.rs
+++ b/src/main.rs
@ -2,19 +2,19 @@ use physarum::model;

 fn main() {
    // # of iterations to go through
-    let n_iterations = 512;
+    let n_iterations = 2048;

    // Size of grid and pictures
    let (width, height) = (256, 256);

    // # of agents
-    let n_particles = 1 << 24;
+    let n_particles = 1 << 20;
    println!("n_particles: {}", n_particles);

    let diffusivity = 1;

    // `n_populations` is the # of types of agents
-    let n_populations = 2;
+    let n_populations = 1;
    // let n_populations = 1 + rng.gen_range(1..4); // make # of populations between 2 and 4 (inclusive)
    

--- a/src/math.rs
+++ b/src/math.rs
@ -0,0 +1,44 @@
+#[inline(always)]
+fn to_radians(x: f32) -> f32 {
+    x * (std::f32::consts::PI / 180.0)
+}
+
+/// Branchless absolute value: clears the sign bit of the float. Previously in trig.rs.
+/// From https://bits.stephan-brumme.com/absFloat.html
+#[allow(dead_code)]
+#[inline(always)]
+fn abs(x: f32) -> f32 {
+    return f32::from_bits(x.to_bits() & 0x7FFF_FFFF);
+}
+
+/// Previously from trig.rs
+/// Branchless floor implementation (casts through `i32`, so only valid for inputs within `i32` range)
+#[allow(dead_code)]
+#[inline(always)]
+fn floor(x: f32) -> f32 {
+    let mut x_trunc = (x as i32) as f32;
+    x_trunc -= (x < x_trunc) as i32 as f32;
+    return x_trunc;
+}
+
+/// Previously from trig.rs
+/// Approximates `cos(x)` (with `x` in radians) with a maximum error of `0.002`
+/// https://stackoverflow.com/posts/28050328/revisions
+#[allow(dead_code)]
+#[inline(always)]
+pub fn cos(mut x: f32) -> f32 {
+    const ALPHA: f32 = 0.5 * std::f32::consts::FRAC_1_PI;
+    x *= ALPHA;
+    x -= 0.25_f32 + floor(x + 0.25_f32);
+    x *= 16.0_f32 * (abs(x) - 0.5_f32);
+    x += 0.225_f32 * x * (abs(x) - 1.0_f32);
+    return x;
+}
+
+/// Previously from trig.rs
+/// Approximates `sin(x)` (with `x` in radians) with a maximum error of `0.002`
+#[allow(dead_code)]
+#[inline(always)]
+pub fn sin(x: f32) -> f32 {
+    return cos(x - std::f32::consts::FRAC_PI_2);
+}
--- a/src/model.rs
+++ b/src/model.rs
@ -2,6 +2,7 @@ use crate::{
    grid::{combine, Grid, PopulationConfig},
    palette::{random_palette, Palette},
    imgdata::ImgData,
+    util::wrap,
 };

 use rand::{seq::SliceRandom, Rng};
@ -13,7 +14,6 @@ use std::time::{Instant};
 use rayon::iter::{ParallelIterator,};
 use indicatif::{ParallelProgressIterator, ProgressBar, ProgressStyle};
 use std::path::Path;
-use crate::util::wrap;

 /// A single Physarum agent. The x and y positions are continuous, hence we use floating point
 /// numbers instead of integers.
@ -39,21 +39,7 @@ impl Agent {
        }
    }

-    fn get_sensor_coords(&mut self, x: f32, y: f32, sensor_distance: f32, sensor_angle: f32, angle: f32) -> (f32, f32, f32, f32, f32, f32) {
-        let xc = x + self.angle.cos() * sensor_distance;
-        let yc = y + self.angle.sin() * sensor_distance;
-        
-        let agent_add_sens = angle + sensor_angle;
-        let agent_sub_sens = angle - sensor_angle;
-
-        let xl = x + agent_sub_sens.cos() * sensor_distance;
-        let yl = y + agent_sub_sens.sin() * sensor_distance;
-        let xr = x + agent_add_sens.cos() * sensor_distance;
-        let yr = y + agent_add_sens.sin() * sensor_distance;
-
-        return (xc, yc, xl, yl, xr, yr);
-    }
-
+    #[inline]
    pub fn tick(&mut self, grid: &Grid) {        
        let (width, height) = (grid.width, grid.height);
        let PopulationConfig {
@ -64,7 +50,16 @@ impl Agent {
            ..
        } = grid.config;

-        let (xc, yc, xl, yl, xr, yr) = Self::get_sensor_coords(self, self.x, self.y, sensor_distance, sensor_angle, self.angle);
+        let xc = self.x + fastapprox::faster::cos(self.angle) * sensor_distance;
+        let yc = self.y + fastapprox::faster::sin(self.angle) * sensor_distance;
+        
+        let agent_add_sens = self.angle + sensor_angle;
+        let agent_sub_sens = self.angle - sensor_angle;
+
+        let xl = self.x + fastapprox::faster::cos(agent_sub_sens) * sensor_distance;
+        let yl = self.y + fastapprox::faster::sin(agent_sub_sens) * sensor_distance;
+        let xr = self.x + fastapprox::faster::cos(agent_add_sens) * sensor_distance;
+        let yr = self.y + fastapprox::faster::sin(agent_add_sens) * sensor_distance;

        // We sense from the buffer because this is where we previously combined data from all the grids.
        let center = grid.get_buf(xc, yc);
@ -86,9 +81,10 @@ impl Agent {
        }

        let delta_angle = rotation_angle * direction;
+
        self.angle = wrap(self.angle + delta_angle, TAU);
-        self.x = wrap(self.x + step_distance * self.angle.cos(), width as f32);
-        self.y = wrap(self.y + step_distance * self.angle.sin(), height as f32);
+        self.x = wrap(self.x + step_distance * fastapprox::faster::cos(self.angle), width as f32);
+        self.y = wrap(self.y + step_distance * fastapprox::faster::sin(self.angle), height as f32);
    }
 }

@ -195,7 +191,7 @@ impl Model {


    /// Simulates `steps` # of steps
-    #[inline(always)]
+    #[inline]
    pub fn run(&mut self, steps: usize) {
        let debug: bool = false;

@ -221,11 +217,10 @@ impl Model {

            // Tick agents
            self.agents.par_iter_mut().for_each(|agent| {
-                let grid = &grids[agent.population_id];
-                agent.tick(grid);
+                agent.tick(&grids[agent.population_id]);
            });

-            // Deposit
+            // Deposit // TODO - Make this parallel
            for agent in self.agents.iter() {
                self.grids[agent.population_id].deposit(agent.x, agent.y);
            }
@ -243,9 +238,7 @@ impl Model {
            time_per_agent_list.push(ms_per_agent);
            time_per_step_list.push(agents_tick_elapsed);

-            if debug {
-                println!("Finished tick for all agents. took {}ms\nTime per agent: {}ms\n", agents_tick_elapsed, ms_per_agent);
-            }
+            if debug {println!("Finished tick for all agents. took {}ms\nTime per agent: {}ms\n", agents_tick_elapsed, ms_per_agent)};

            self.iteration += 1;
            pb.set_position(i as u64);
--- a/src/util.rs
+++ b/src/util.rs
@ -1,4 +1,4 @@
 #[inline(always)]
 pub fn wrap(x: f32, max: f32) -> f32 {
-    x - max * ((x > max) as i32 as f32 - (x < 0.0_f32) as i32 as f32)
-}
+    return x - max * ((x > max) as i32 as f32 - (x < 0.0_f32) as i32 as f32);
+}