diff --git a/Cargo.toml b/Cargo.toml
index 2d00641..0cf1cec 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -114,6 +114,7 @@ harness = false
 [[bench]]
 name = "sla"
 harness = false
+required-features = ["viz"]
 
 [workspace]
 members = ["pathmap-derive"]
diff --git a/benches/binary_keys.rs b/benches/binary_keys.rs
index 2d8ded7..8f3c1e0 100644
--- a/benches/binary_keys.rs
+++ b/benches/binary_keys.rs
@@ -77,6 +77,22 @@ fn binary_val_count_bench(bencher: Bencher, n: u64) {
     assert_eq!(sink, n as usize);
 }
 
+#[divan::bench(args = [125, 250, 500, 1000, 2000, 4000])]
+fn binary_goat_val_count_bench(bencher: Bencher, n: u64) {
+
+    let keys = make_keys(n as usize, 1);
+
+    let mut map: PathMap<u64> = PathMap::new();
+    for i in 0..n { map.set_val_at(&keys[i as usize], i); }
+
+    //Benchmark the time taken to count the number of values in the map
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count()
+    });
+    assert_eq!(sink, n as usize);
+}
+
 #[divan::bench(args = [50, 100, 200, 400, 800, 1600])]
 fn binary_drop_head(bencher: Bencher, n: u64) {
 
diff --git a/benches/cities.rs b/benches/cities.rs
index 231cb6e..cc5cbc9 100644
--- a/benches/cities.rs
+++ b/benches/cities.rs
@@ -168,6 +168,25 @@ fn cities_val_count(bencher: Bencher) {
     assert_eq!(sink, unique_count);
 }
 
+#[divan::bench()]
+fn cities_goat_val_count(bencher: Bencher) {
+
+    let pairs = read_data();
+    let mut map = PathMap::new();
+    let mut unique_count = 0;
+    for (k, v) in pairs.iter() {
+        if map.set_val_at(k, *v).is_none() {
+            unique_count += 1;
+        }
+    }
+
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count();
+    });
+    assert_eq!(sink, unique_count);
+}
+
 #[cfg(feature="arena_compact")]
 #[divan::bench()]
 fn cities_val_count_act(bencher: Bencher) {
diff --git a/benches/shakespeare.rs b/benches/shakespeare.rs
index 2040ba5..5528f73 100644
--- a/benches/shakespeare.rs
+++ b/benches/shakespeare.rs
@@ -113,6 +113,25 @@ fn shakespeare_words_val_count(bencher: Bencher) {
     assert_eq!(sink, unique_count);
 }
 
+#[divan::bench()]
+fn shakespeare_words_goat_val_count(bencher: Bencher) {
+
+    let strings = read_data(true);
+    let mut map = PathMap::new();
+    let mut unique_count = 0;
+    for (v, k) in strings.iter().enumerate() {
+        if map.set_val_at(k, v).is_none() {
+            unique_count += 1;
+        }
+    }
+
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count();
+    });
+    assert_eq!(sink, unique_count);
+}
+
 #[divan::bench()]
 fn shakespeare_sentences_insert(bencher: Bencher) {
 
@@ -168,6 +187,25 @@ fn shakespeare_sentences_val_count(bencher: Bencher) {
     assert_eq!(sink, unique_count);
 }
 
+#[divan::bench()]
+fn shakespeare_sentences_goat_val_count(bencher: Bencher) {
+
+    let strings = read_data(false);
+    let mut map = PathMap::new();
+    let mut unique_count = 0;
+    for (v, k) in strings.iter().enumerate() {
+        if map.set_val_at(k, v).is_none() {
+            unique_count += 1;
+        }
+    }
+
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count();
+    });
+    assert_eq!(sink, unique_count);
+}
+
 #[cfg(feature="arena_compact")]
 #[divan::bench()]
 fn shakespeare_sentences_val_count_act(bencher: Bencher) {
diff --git a/benches/sla.rs b/benches/sla.rs
index 5d84dbe..92d89f0 100644
--- a/benches/sla.rs
+++ b/benches/sla.rs
@@ -388,7 +388,7 @@ fn tipover_attention_weave() {
     // let res = rtq.vF_mut().merkleize();
     // println!("{:?}", res.hash);
     let t0 = Instant::now();
-    println!("{:?} {:?}", rtq.vF().read_zipper().into_cata_cached(morphisms::alg::hash), t0.elapsed().as_micros());
+    // println!("{:?} {:?}", rtq.vF().read_zipper().into_cata_cached(morphisms::alg::hash), t0.elapsed().as_micros());
     return;
 
     // rtk.vF_mut().merkleize();
diff --git a/benches/sparse_keys.rs b/benches/sparse_keys.rs
index 8489314..42c4d42 100644
--- a/benches/sparse_keys.rs
+++ b/benches/sparse_keys.rs
@@ -92,6 +92,26 @@ fn sparse_val_count_bench(bencher: Bencher, n: u64) {
     assert_eq!(sink, n as usize);
 }
 
+#[divan::bench(args = [125, 250, 500, 1000, 2000, 4000])]
+fn sparse_goat_val_count_bench(bencher: Bencher, n: u64) {
+
+    let mut r = StdRng::seed_from_u64(1);
+    let keys: Vec<Vec<u8>> = (0..n).into_iter().map(|_| {
+        let len = (r.random::<u8>() % 18) + 3; //length between 3 and 20 chars
+        (0..len).into_iter().map(|_| r.random::<u8>()).collect()
+    }).collect();
+
+    let mut map: PathMap<u64> = PathMap::new();
+    for i in 0..n { map.set_val_at(&keys[i as usize], i); }
+
+    //Benchmark the time taken to count the number of values in the map
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count()
+    });
+    assert_eq!(sink, n as usize);
+}
+
 #[divan::bench(args = [50, 100, 200, 400, 800, 1600])]
 fn binary_drop_head(bencher: Bencher, n: u64) {
 
diff --git a/benches/superdense_keys.rs b/benches/superdense_keys.rs
index 597954d..34a09ce 100644
--- a/benches/superdense_keys.rs
+++ b/benches/superdense_keys.rs
@@ -253,6 +253,21 @@ fn superdense_val_count_bench(bencher: Bencher, n: u64) {
     assert_eq!(sink, n as usize);
 }
 
+#[divan::bench(sample_size = 1, args = [100, 200, 400, 800, 1600, 3200, 20_000])]
+fn superdense_goat_val_count_bench(bencher: Bencher, n: u64) {
+
+    let mut map: PathMap<u64> = PathMap::new();
+    for i in 0..n { map.set_val_at(prefix_key(&i), i); }
+
+    //Benchmark the time taken to count the number of values in the map
+    let mut sink = 0;
+    bencher.bench_local(|| {
+        *black_box(&mut sink) = map.goat_val_count()
+    });
+    assert_eq!(sink, n as usize);
+}
+
+
 #[cfg(feature="arena_compact")]
 #[divan::bench(sample_size = 1, args = [100, 200, 400, 800, 1600, 3200, 20_000])]
 fn superdense_val_count_bench_act(bencher: Bencher, n: u64) {
diff --git a/src/dense_byte_node.rs b/src/dense_byte_node.rs
index dd846a2..defaccf 100644
--- a/src/dense_byte_node.rs
+++ b/src/dense_byte_node.rs
@@ -29,7 +29,7 @@ pub struct ByteNode {
     #[cfg(feature = "nightly")]
     values: Vec,
     #[cfg(not(feature = "nightly"))]
-    values: Vec,
+    pub(crate) values: Vec,
     alloc: A,
 }
 
@@ -991,10 +991,18 @@ impl> TrieNode
             t + cf.has_val() as usize + cf.rec().map(|r| val_count_below_node(r, cache)).unwrap_or(0)
         });
     }
-    fn node_goat_val_count(&self) -> usize {
+/*    fn node_goat_val_count(&self) -> usize {
         return self.values.iter().rfold(0, |t, cf| {
-            t + cf.has_val() as usize
+            t + cf.has_val() as usize + cf.rec().map(|r| r.as_tagged().node_goat_val_count()).unwrap_or(0)
         });
+    }*/
+    #[inline]
+    fn node_goat_val_count(&self) -> usize {
+        let mut result = 0;
+        for cf in self.values.iter() {
+            result += cf.has_val() as usize
+        }
+        result
     }
     fn node_child_iter_start(&self) -> (u64, Option<&TrieNodeODRc>) {
         for (pos, cf) in self.values.iter().enumerate() {
diff --git a/src/line_list_node.rs b/src/line_list_node.rs
index 040f714..6612031 100644
--- a/src/line_list_node.rs
+++ b/src/line_list_node.rs
@@ -403,7 +403,7 @@ impl LineListNode {
         }
     }
     #[inline]
-    unsafe fn child_in_slot<const SLOT: usize>(&self) -> &TrieNodeODRc<V, A> {
+    pub(crate) unsafe fn child_in_slot<const SLOT: usize>(&self) -> &TrieNodeODRc<V, A> {
         match SLOT {
             0 => unsafe{ &*self.val_or_child0.child },
             1 => unsafe{ &*self.val_or_child1.child },
@@ -419,7 +419,7 @@ impl LineListNode {
         }
     }
     #[inline]
-    unsafe fn val_in_slot<const SLOT: usize>(&self) -> &V {
+    pub(crate) unsafe fn val_in_slot<const SLOT: usize>(&self) -> &V {
         match SLOT {
             0 => unsafe{ &**self.val_or_child0.val },
             1 => unsafe{ &**self.val_or_child1.val },
@@ -1986,6 +1986,25 @@ impl TrieNode for LineListNode
         }
         result
     }
+/*    #[inline]
+    fn node_goat_val_count(&self) -> usize {
+        let mut result = 0;
+        if self.is_used_value_0() {
+            result += 1;
+        }
+        if self.is_used_value_1() {
+            result += 1;
+        }
+        if self.is_used_child_0() {
+            let child_node = unsafe{ self.child_in_slot::<0>() };
+            result += child_node.as_tagged().node_goat_val_count();
+        }
+        if self.is_used_child_1() {
+            let child_node = unsafe{ self.child_in_slot::<1>() };
+            result += child_node.as_tagged().node_goat_val_count();
+        }
+        result
+    }*/
     #[inline]
     fn node_goat_val_count(&self) -> usize {
         //Here are 3 alternative implementations. They're basically the same in perf, with a slight edge to the
diff --git a/src/trie_map.rs b/src/trie_map.rs
index 3c5b0f3..5d21186 100644
--- a/src/trie_map.rs
+++ b/src/trie_map.rs
@@ -511,9 +511,25 @@ impl PathMap {
         let root_val = unsafe{ &*self.root_val.get() }.is_some() as usize;
         match self.root() {
             Some(root) => {
-                traverse_physical(root,
-                    |node, ctx: usize| { ctx + node.node_goat_val_count() },
-                    |ctx, child_ctx| { ctx + child_ctx },
+                // root.as_tagged().node_goat_val_count() + root_val
+                // traverse_physical(root,
+                //     |node, ctx: usize| { ctx + node.node_goat_val_count() },
+                //     |ctx, child_ctx| { ctx + child_ctx },
+                // ) + root_val
+
+                // traverse_split_cata(
+                //     root,
+                //     |v, _| { 1usize },
+                //     |_, w, _| { 1 + w },
+                //     |bm, ws: &mut [usize], _| { ws.iter().sum() }
+                // ) + root_val
+                // Adam: this doesn't need to be called "traverse_osplit_cata" or be exposed under this interface; it can just live in morphisms
+                traverse_osplit_cata(
+                    root,
+                    |v, _| { 1usize }, // on leaf values
+                    |_, w, _| { 1 + w }, // on values amongst a path
+                    |bm, w: usize, _, total| { *total += w }, // on merging children into a node
+                    |bm, total: usize, _| { total } // finalizing a node
                 ) + root_val
             },
             None => root_val
diff --git a/src/trie_node.rs b/src/trie_node.rs
index 111bd5e..0687c4f 100644
--- a/src/trie_node.rs
+++ b/src/trie_node.rs
@@ -7,7 +7,7 @@ use dyn_clone::*;
 use local_or_heap::LocalOrHeap;
 use arrayvec::ArrayVec;
 
-use crate::utils::ByteMask;
+use crate::utils::{BitMask, ByteMask};
 use crate::alloc::Allocator;
 use crate::dense_byte_node::*;
 use crate::ring::*;
@@ -2422,16 +2422,147 @@ fn traverse_physical_children_internal(node: TaggedNode
 {
     let mut ctx = Ctx::default();
-    let (mut tok, mut child) = node.node_child_iter_start();
-    while let Some(child_node) = child {
-        let child_ctx = traverse_physical_internal(child_node, node_f, fold_f, cache);
-        ctx = fold_f(ctx, child_ctx);
-        (tok, child) = node.node_child_iter_next(tok);
+    match node {
+        TaggedNodeRef::DenseByteNode(n) => {
+            for cf in n.values.iter() {
+                if let Some(rec) = cf.rec() {
+                    let child_ctx = traverse_physical_internal(rec, node_f, fold_f, cache);
+                    ctx = fold_f(ctx, child_ctx);
+                }
+            }
+        }
+        TaggedNodeRef::LineListNode(n) => {
+            if n.is_used_child_0() {
+                let child_node = unsafe{ n.child_in_slot::<0>() };
+                let child_ctx = traverse_physical_internal(child_node, node_f, fold_f, cache);
+                ctx = fold_f(ctx, child_ctx);
+            }
+            if n.is_used_child_1() {
+                let child_node = unsafe{ n.child_in_slot::<1>() };
+                let child_ctx = traverse_physical_internal(child_node, node_f, fold_f, cache);
+                ctx = fold_f(ctx, child_ctx);
+            }
+        }
+        TaggedNodeRef::CellByteNode(_) => { todo!() }
+        TaggedNodeRef::TinyRefNode(_) => { todo!() }
+        TaggedNodeRef::EmptyNode => { todo!() }
     }
     node_f(node, ctx)
 }
 
+// This experiment is still OK, but the `&mut [W]` is awkward to instantiate if you don't actually have
+/*pub fn traverse_split_cata<'a, A : Allocator, V : TrieValue, W, MapF, CollapseF, AlgF>(node: &TrieNodeODRc<V, A>, mut map_f: MapF, mut collapse_f: CollapseF, alg_f: AlgF) -> W
+where
+    MapF: Copy + FnMut(&V, &[u8]) -> W + 'a,
+    CollapseF: Copy + FnMut(&V, W, &[u8]) -> W + 'a,
+    AlgF: Copy + Fn(&ByteMask, &mut [W], &[u8]) -> W + 'a,
+{
+    match node.as_tagged() {
+        TaggedNodeRef::DenseByteNode(n) => {
+            let mut ws = [const { std::mem::MaybeUninit::<W>::uninit() }; 256];
+            // let mut ws: Vec<std::mem::MaybeUninit<W>> = Vec::with_capacity(n.mask.count_bits());
+            // unsafe { ws.set_len(n.mask.count_bits()) };
+            let mut c = 0;
+            for cf in n.values.iter() {
+                if let Some(rec) = cf.rec() {
+                    let w = traverse_split_cata(rec, map_f, collapse_f, alg_f);
+                    if let Some(v) = cf.val() {
+                        ws[c].write(collapse_f(v, w, &[]));
+                    } else {
+                        ws[c].write(w);
+                    }
+                } else if let Some(v) = cf.val() {
+                    ws[c].write(map_f(v, &[]));
+                }
+                c += 1;
+            }
+            alg_f(&n.mask, unsafe { std::mem::transmute(&mut ws[..c]) }, &[])
+        }
+        TaggedNodeRef::LineListNode(n) => {
+            // let mut ws = vec![];
+            // if n.is_used_value_0() {
+            //     ws.append(map_f(unsafe { n.val_in_slot::<0>() }, &[]));
+            // }
+            // if n.is_used_value_1() {
+            //     ws.append(map_f(unsafe { n.val_in_slot::<1>() }, &[]));
+            // }
+            // if n.is_used_child_0() {
+            //     let child_node = unsafe{ n.child_in_slot::<0>() };
+            //     let child_ctx = traverse_split_cata(child_node, map_f, collapse_f, alg_f);
+            //
+            // }
+            // if n.is_used_child_1() {
+            //     let child_node = unsafe{ n.child_in_slot::<1>() };
+            //     let child_ctx = traverse_physical_internal(child_node, node_f, fold_f, cache);
+            //     ctx = fold_f(ctx, child_ctx);
+            // }
+            alg_f(&ByteMask::new(), &mut [], &[])
+        }
+        TaggedNodeRef::CellByteNode(_) => { todo!() }
+        TaggedNodeRef::TinyRefNode(_) => { todo!() }
+        TaggedNodeRef::EmptyNode => { todo!() }
+    }
+}
+*/
+
+// Adam: This seems to be a winner, though it needs some work, the split alg gives us the opportunity to nicely compose the different calls for the different node types without introducing overhead
+pub fn traverse_osplit_cata<'a, A : Allocator, V : TrieValue, Alg : Default, W, MapF, CollapseF, InAlgF, OutAlgF>(node: &TrieNodeODRc<V, A>, mut map_f: MapF, mut collapse_f: CollapseF, in_alg_f: InAlgF, out_alg_f: OutAlgF) -> W
+where
+    MapF: Copy + FnMut(&V, &[u8]) -> W + 'a,
+    CollapseF: Copy + FnMut(&V, W, &[u8]) -> W + 'a,
+    InAlgF: Copy + Fn(&ByteMask, W, &[u8], &mut Alg),
+    OutAlgF: Copy + Fn(&ByteMask, Alg, &[u8]) -> W + 'a,
+{
+    match node.as_tagged() {
+        TaggedNodeRef::DenseByteNode(n) => {
+            let mut ws = Some(Alg::default());
+            for cf in n.values.iter() {
+                if let Some(rec) = cf.rec() {
+                    let w = traverse_osplit_cata(rec, map_f, collapse_f, in_alg_f, out_alg_f);
+                    if let Some(v) = cf.val() {
+                        in_alg_f(&n.mask, collapse_f(v, w, &[]), &[], unsafe { ws.as_mut().unwrap_unchecked() });
+                    } else {
+                        in_alg_f(&n.mask, w, &[], unsafe { ws.as_mut().unwrap_unchecked() });
+                    }
+                } else if let Some(v) = cf.val() {
+                    in_alg_f(&n.mask, map_f(v, &[]), &[], unsafe { ws.as_mut().unwrap_unchecked() });
+                }
+            }
+            out_alg_f(&n.mask, unsafe { std::mem::take(&mut ws).unwrap_unchecked() }, &[])
+        }
+        TaggedNodeRef::LineListNode(n) => {
+            // Adam: I skimped out on the collapse logic here, I assume there are some built-in LineListNode functions I can use for prefixes, or another way to organize the branching based on the mask directly
+            let mut ws = Some(Alg::default());
+
+            if n.is_used_value_0() {
+                in_alg_f(&ByteMask::new(), map_f(unsafe { n.val_in_slot::<0>() }, &[]), &[], unsafe { ws.as_mut().unwrap_unchecked() });
+            }
+            if n.is_used_value_1() {
+                in_alg_f(&ByteMask::new(), map_f(unsafe { n.val_in_slot::<1>() }, &[]), &[], unsafe { ws.as_mut().unwrap_unchecked() });
+            }
+            if n.is_used_child_0() {
+                let child_node = unsafe{ n.child_in_slot::<0>() };
+                let w = traverse_osplit_cata(child_node, map_f, collapse_f, in_alg_f, out_alg_f);
+                in_alg_f(&ByteMask::new(), w, &[], unsafe { ws.as_mut().unwrap_unchecked() });
+
+            }
+            if n.is_used_child_1() {
+                let child_node = unsafe{ n.child_in_slot::<1>() };
+                let w = traverse_osplit_cata(child_node, map_f, collapse_f, in_alg_f, out_alg_f);
+                in_alg_f(&ByteMask::new(), w, &[], unsafe { ws.as_mut().unwrap_unchecked() });
+            }
+
+            out_alg_f(&ByteMask::new(), unsafe { std::mem::take(&mut ws).unwrap_unchecked() }, &[])
+        }
+        TaggedNodeRef::CellByteNode(_) => { todo!() }
+        TaggedNodeRef::TinyRefNode(_) => { todo!() }
+        TaggedNodeRef::EmptyNode => {
+            out_alg_f(&ByteMask::new(), Alg::default(), &[])
+        }
+    }
+}
+
 /// Internal function to walk a mut TrieNodeODRc ref along a path
 ///
 /// If `stop_early` is `true`, this function will return the parent node of the path and will never return
@@ -2483,6 +2614,9 @@ pub(crate) fn make_cell_node(node: &mut Tr
 // module come from the visibility of the trait it is derived on. In this case, `TrieNode`
 //Credit to QuineDot for his ideas on this pattern here: https://users.rust-lang.org/t/inferred-lifetime-for-dyn-trait/112116/7
 pub(crate) use opaque_dyn_rc_trie_node::TrieNodeODRc;
+use crate::morphisms::SplitCata;
+use crate::TrieValue;
+
 #[cfg(not(feature = "slim_ptrs"))]
 mod opaque_dyn_rc_trie_node {
     use std::sync::Arc;
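
Note (reviewer sketch, not part of the patch): the four closures handed to `traverse_osplit_cata` in `goat_val_count` play distinct roles: map a leaf value, collapse a value that sits above a subtree, fold one child's result into a per-node accumulator, and finalize that accumulator into the node's result. The standalone sketch below reproduces that callback shape on a toy trie, specialized to the usize-counting case. `Node` and `count_vals` here are hypothetical stand-ins, not pathmap types, and the node layout is simplified compared to the real DenseByteNode/LineListNode arms.

// Toy trie: a node optionally stores a value and has any number of children.
struct Node<V> {
    val: Option<V>,
    children: Vec<Node<V>>,
}

// Same callback roles as traverse_osplit_cata, with W = usize and Alg = usize.
fn count_vals<V>(
    node: &Node<V>,
    map_f: impl Fn(&V) -> usize + Copy,             // on leaf values
    collapse_f: impl Fn(&V, usize) -> usize + Copy, // on values amongst a path
    in_alg_f: impl Fn(usize, &mut usize) + Copy,    // merge one child's result into the node accumulator
    out_alg_f: impl Fn(usize) -> usize + Copy,      // finalize the node accumulator
) -> usize {
    let mut acc = 0usize; // plays the role of Alg::default()
    for child in &node.children {
        let w = count_vals(child, map_f, collapse_f, in_alg_f, out_alg_f);
        in_alg_f(w, &mut acc);
    }
    let below = out_alg_f(acc);
    match &node.val {
        Some(v) if node.children.is_empty() => map_f(v),
        Some(v) => collapse_f(v, below),
        None => below,
    }
}

fn main() {
    // A root value plus two leaf values: expect a count of 3.
    let trie = Node { val: Some(7u64), children: vec![
        Node { val: Some(8u64), children: vec![] },
        Node { val: Some(9u64), children: vec![] },
    ]};
    let n = count_vals(
        &trie,
        |_: &u64| 1usize,                          // on leaf values
        |_: &u64, w: usize| 1 + w,                 // on values amongst a path
        |w: usize, total: &mut usize| *total += w, // on merging children into a node
        |total: usize| total,                      // finalizing a node
    );
    assert_eq!(n, 3);
}

With these closures the sketch returns the total number of stored values, which is what the `goat_val_count` call in trie_map.rs computes over the real node types (plus the root value).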