diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..4c5ff2a
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,16 @@
+root = true
+
+[Makefile]
+end_of_line = lf
+indent_style = tab
+indent_size = 8
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 2
+trim_trailing_whitespace = true
+max_line_length = 80
+
diff --git a/.gitignore b/.gitignore
index 793b258..fe1cf4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,8 +37,7 @@ apps/pr/pr
 apps/sgd/sgd
 apps/triangle/triangle
 apps/triangle/triangle_nontex
-cub
 inputs/
-mgpu
 rt/src/*.o
 rt/lib/libggrt.a
+cscope.out
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..54d04bb
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "rt/include/mgpu"]
+	path = rt/include/mgpu
+	url = https://github.com/moderngpu/moderngpu
+	branch = branch_1.1
diff --git a/Makefile b/Makefile
index f5c2a34..81d2e01 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 TOPLEVEL := .
 IRAPPS := bfs mst sssp sgd dmr mis cc pr triangle
 APPS := bh pta
-INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2
-BIP_INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz
+INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2
+BIP_INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz
 INPUT := lonestargpu2-inputs.tar.bz2
 BIP_INPUT := lonestargpu21-bipartite-inputs.tar.xz
 
@@ -29,7 +29,7 @@ inputs:
 	@echo "Uncompressing inputs ..."
 	@tar xvf $(INPUT)
 	@tar xvf $(BIP_INPUT)
-	@rm $(INPUT) $(BIP_INPUT)
+	@echo "Do you want to: rm $(INPUT) $(BIP_INPUT)"
 	@echo "Inputs available at $(TOPLEVEL)/inputs/"
 
 clean:
diff --git a/README.md b/README.md
index 660ff5f..f2385d7 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,10 @@
 The LonestarGPU (LSG) suite contains CUDA implementations of several
 irregular algorithms that exhibit amorphous data parallelism.
 
+The benchmarks in this repository have been moved to
+[Galois](https://github.com/IntelligentSoftwareSystems/Galois/): the
+instructions for running GPU benchmarks may be found in the `README`.
+
 # INSTALLATION
 
 You can checkout the latest release by typing (in a terminal):
@@ -21,24 +25,22 @@ git clone https://github.com/IntelligentSoftwareSystems/GaloisGPU
 
 ## Software pre-requisites
 
-* CUB (v1.3.1)
-
-[https://github.com/NVlabs/cub](https://github.com/NVlabs/cub)
-
-
 * ModernGPU (v1.1)
 
 [https://github.com/NVlabs/moderngpu/releases](https://github.com/NVlabs/moderngpu/releases)
 
-You will need to download CUB and Mgpu.
+ModernGPU can be downloaded as a submodule using `git submodule --init --recursive`.
+
+Else, you may download it manually.  
 Assuming LSGDIR contains the LonestarGPU source (i.e., this repository): 
 
 ```Shell
 cd $LSGDIR/rt/include
-ln -s path-to-cub-x.y/ cub
 ln -s path-to-mgpu-x.y/ mgpu
 ```
 
+CUB is included in recent CUDA releases, and need not be installed separately.
+
 To compile for a specific GPU architecture, make changes to the arch.mk file in the top directory.
 
 ## BUILDING
diff --git a/apps/mis/Makefile b/apps/mis/Makefile
index f35a72d..c206060 100644
--- a/apps/mis/Makefile
+++ b/apps/mis/Makefile
@@ -1,10 +1,10 @@
 # expect to be in ROOT/app/
 include ../../arch.mk
-INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include  -I../../rt/include/cub
+INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
 LIBDIR=../../rt/lib
 LIBS=-lggrt -lcurand
 SKELAPP=../../skelapp
-EXTRAFLAGS=-g -O3 -DCSRG_TEX -w
+EXTRAFLAGS=-g -O3 -DCSRG_TEX
 COMPRESS_LIBS=-lz
 SNAPPY_LIBDIR=~/.local/lib
 
diff --git a/apps/mis/kernel.cu b/apps/mis/kernel.cu
index 0100b00..babeec5 100644
--- a/apps/mis/kernel.cu
+++ b/apps/mis/kernel.cu
@@ -167,35 +167,44 @@ void gg_main_pipe_1(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, Pipe
     }
   }
 }
-__global__ void gg_main_pipe_1_gpu(CSRGraphTy gg, int STEPS, unsigned int* prio, PipeContextT<WorklistT> pipe, dim3 blocks, dim3 threads, int* cl_STEPS)
-{
-  unsigned tid = TID_1D;
-  unsigned nthreads = TOTAL_THREADS_1D;
 
+// Changed this from __global__ to a CPU function.
+// Changed STEPS from a GPU variable to a CPU variable.
+void gg_main_pipe_1_gpu(CSRGraphTy gg,
+                        unsigned int* prio,
+                        PipeContextT<WorklistT> pipe,
+                        dim3 blocks,
+                        dim3 threads,
+                        int* cl_STEPS)
+{
   const unsigned __kernel_tb_size = __tb_one;
-  STEPS = *cl_STEPS;
+  int STEPS = *cl_STEPS;
   {
     init_wl <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());
     pipe.in_wl().swap_slots();
-    cudaDeviceSynchronize();
+    assert(cudaDeviceSynchronize() == cudaSuccess);
     pipe.advance2();
     while (pipe.in_wl().nitems())
     {
       mark_nodes <<<blocks, threads>>>(gg, prio, pipe.in_wl(), pipe.out_wl());
-      cudaDeviceSynchronize();
+      assert(cudaDeviceSynchronize() == cudaSuccess);
       drop_marked_nodes_and_nbors <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());
       pipe.in_wl().swap_slots();
-      cudaDeviceSynchronize();
+      assert(cudaDeviceSynchronize() == cudaSuccess);
       pipe.advance2();
       STEPS++;
     }
   }
-  if (tid == 0)
-  {
-    *cl_STEPS = STEPS;
-  }
+
+  *cl_STEPS = STEPS;
 }
-void gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, PipeContextT<WorklistT>& pipe, dim3& blocks, dim3& threads)
+
+void gg_main_pipe_1_wrapper(CSRGraphTy& gg,
+                            int& STEPS,
+                            Shared<unsigned int>& prio,
+                            PipeContextT<WorklistT>& pipe,
+                            dim3& blocks,
+                            dim3& threads)
 {
   if (false)
   {
@@ -203,14 +212,9 @@ void gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& pr
   }
   else
   {
-    int* cl_STEPS;
-    check_cuda(cudaMalloc(&cl_STEPS, sizeof(int) * 1));
-    check_cuda(cudaMemcpy(cl_STEPS, &STEPS, sizeof(int) * 1, cudaMemcpyHostToDevice));
+    int* cl_STEPS = &STEPS;
 
-    gg_main_pipe_1_gpu<<<1,1>>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,blocks,threads,cl_STEPS);
-    // gg_main_pipe_1_gpu_gb<<<gg_main_pipe_1_gpu_gb_blocks, __tb_gg_main_pipe_1_gpu_gb>>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,cl_STEPS, gg_main_pipe_1_gpu_gb_barrier);
-    check_cuda(cudaMemcpy(&STEPS, cl_STEPS, sizeof(int) * 1, cudaMemcpyDeviceToHost));
-    check_cuda(cudaFree(cl_STEPS));
+    gg_main_pipe_1_gpu(gg, prio.gpu_wr_ptr(), pipe, blocks, threads, cl_STEPS);
   }
 }
 void gg_main(CSRGraphTy& hg, CSRGraphTy& gg)
@@ -223,10 +227,10 @@ void gg_main(CSRGraphTy& hg, CSRGraphTy& gg)
   ggc::Timer t ("random");
   t.start();
   gen_prio_gpu <<<blocks, threads>>>(gg, prio.gpu_wr_ptr(), SEED1, SEED2, SEED3, SEED4);
-  cudaDeviceSynchronize();
+  assert(cudaDeviceSynchronize() == cudaSuccess);
   t.stop();
   printf("Random number generation took %llu ns\n", t.duration());
   pipe = PipeContextT<WorklistT>(gg.nnodes);
   gg_main_pipe_1_wrapper(gg,STEPS,prio,pipe,blocks,threads);
   printf("Total steps: %d\n", STEPS);
-}
\ No newline at end of file
+}
diff --git a/apps/mst/Makefile b/apps/mst/Makefile
index 7e40e6c..fbf7bbd 100644
--- a/apps/mst/Makefile
+++ b/apps/mst/Makefile
@@ -1,13 +1,15 @@
 # expect to be in ROOT/app/
 include ../../arch.mk
-INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include  -I../../rt/include/cub
+INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
 LIBDIR=../../rt/lib
 LIBS=-lggrt -lcurand
 SKELAPP=../../skelapp
-EXTRAFLAGS=-g -O3 -DCSRG_TEX -w
+EXTRAFLAGS=-g -O3 -DCSRG_TEX
 COMPRESS_LIBS=-lz
 SNAPPY_LIBDIR=~/.local/lib
 
+NVCCFLAGS += -arch=native
+
 ifeq ($(USE_SNAPPY),1)
   COMPRESS_LIBS+=-lsnappy -L $(SNAPPY_LIBDIR)
 endif
@@ -36,7 +38,7 @@ skel.o: $(SKELAPP)/skel.cu
 	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< -o $@
 
 skel-nontex.o: $(SKELAPP)/skel.cu
-	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(NVCCFLAGS) $(INCLUDES) $< -o $@
+	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(NVCCFLAGS) $(INCLUDES) $< -o $@
 
 %.o: %.cu
 #	nvcc -lineinfo -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
@@ -44,10 +46,10 @@ skel-nontex.o: $(SKELAPP)/skel.cu
 
 # this is only needed if the AST contained 'CSRGraphTy' instead of Graph().param() or such.
 kernel-nontex.o: kernel-nontex.cu
-	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
+	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
 
 support-nontex.o: support.cu
-	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
+	nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
 
 %.cubin: %.cu
 	nvcc -cubin -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@ 
@@ -58,7 +60,7 @@ mst: skel.o kernel.o support.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o
 	cp $@ ../../bin
 
 mst_nontex: skel-nontex.o kernel-nontex.o support-nontex.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o
-	nvcc -g -O3 -w $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS)
+	nvcc -g -O3 $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS)
 	cp $@ ../../bin
 
 mst-dp: skel.o kernel.o support.o
diff --git a/apps/sssp/Makefile b/apps/sssp/Makefile
index 92b826a..1660dd7 100644
--- a/apps/sssp/Makefile
+++ b/apps/sssp/Makefile
@@ -1,10 +1,10 @@
 # expect to be in ROOT/app/
 include ../../arch.mk
-INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include  -I../../rt/include/cub
+INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
 LIBDIR=../../rt/lib
 LIBS=-lggrt -lcurand
 SKELAPP=../../skelapp
-EXTRAFLAGS=-g -O3 -w
+EXTRAFLAGS=-g -O3
 COMPRESS_LIBS=-lz
 SNAPPY_LIBDIR=~/.local/lib
 
diff --git a/apps/sssp/kernel.cu b/apps/sssp/kernel.cu
index dbec6b6..473da89 100644
--- a/apps/sssp/kernel.cu
+++ b/apps/sssp/kernel.cu
@@ -327,25 +327,25 @@ void gg_main_pipe_1(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELT
       {
         t_work.reset_thread_work();
         Inspect_sssp_kernel_dev <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb, pipe.in_wl(), pipe.out_wl());
-        cudaDeviceSynchronize();
+        assert(cudaDeviceSynchronize() == cudaSuccess);
         int num_items = t_work.thread_work_wl.in_wl().nitems();
         if (num_items != 0)
         {
           t_work.compute_prefix_sum();
-          cudaDeviceSynchronize();
+          assert(cudaDeviceSynchronize() == cudaSuccess);
           sssp_kernel_dev_TB_LB <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
-          cudaDeviceSynchronize();
+          assert(cudaDeviceSynchronize() == cudaSuccess);
         }
       }
       sssp_kernel <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
-      cudaDeviceSynchronize();
+      assert(cudaDeviceSynchronize() == cudaSuccess);
       pipe.in_wl().swap_slots();
       pipe.retry2();
     }
     pipe.advance2();
     pipe.out_wl().will_write();
     remove_dups <<<remove_dups_blocks, __tb_remove_dups>>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier);
-    cudaDeviceSynchronize();
+    assert(cudaDeviceSynchronize() == cudaSuccess);
     pipe.in_wl().swap_slots();
     pipe.advance2();
     i++;
@@ -389,40 +389,7 @@ __global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu
     *cl_i = i;
   }
 }
-__global__ void gg_main_pipe_1_gpu(CSRGraph gg, gint_p glevel, int curdelta, int i, int DELTA, GlobalBarrier remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2> pipe, dim3 blocks, dim3 threads, int* cl_curdelta, int* cl_i, bool enable_lb)
-{
-  unsigned tid = TID_1D;
-  unsigned nthreads = TOTAL_THREADS_1D;
 
-  const unsigned __kernel_tb_size = __tb_one;
-  curdelta = *cl_curdelta;
-  i = *cl_i;
-  while (pipe.in_wl().nitems())
-  {
-    while (pipe.in_wl().nitems())
-    {
-      sssp_kernel <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
-      cudaDeviceSynchronize();
-      pipe.in_wl().swap_slots();
-      cudaDeviceSynchronize();
-      pipe.retry2();
-    }
-    cudaDeviceSynchronize();
-    pipe.advance2();
-    remove_dups <<<remove_dups_blocks, __tb_remove_dups>>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier);
-    cudaDeviceSynchronize();
-    pipe.in_wl().swap_slots();
-    cudaDeviceSynchronize();
-    pipe.advance2();
-    i++;
-    curdelta += DELTA;
-  }
-  if (tid == 0)
-  {
-    *cl_curdelta = curdelta;
-    *cl_i = i;
-  }
-}
 void gg_main_pipe_1_wrapper(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELTA, GlobalBarrier& remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)
 {
   static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier;
@@ -467,7 +434,7 @@ void gg_main(CSRGraph& hg, CSRGraph& gg)
   static const size_t remove_dups_blocks = GG_MIN(blocks.x, ggc_get_nSM() * remove_dups_residency);
   if(!remove_dups_barrier_inited) { remove_dups_barrier.Setup(remove_dups_blocks); remove_dups_barrier_inited = true;};
   kernel <<<blocks, threads>>>(gg, start_node);
-  cudaDeviceSynchronize();
+  assert(cudaDeviceSynchronize() == cudaSuccess);
   int i = 0;
   int curdelta = 0;
   printf("delta: %d\n", DELTA);
@@ -477,4 +444,4 @@ void gg_main(CSRGraph& hg, CSRGraph& gg)
   pipe.in_wl().update_gpu(1);
   gg_main_pipe_1_wrapper(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,blocks,threads);
   printf("iterations: %d\n", i);
-}
\ No newline at end of file
+}
diff --git a/arch.mk b/arch.mk
index 293cefb..39de219 100644
--- a/arch.mk
+++ b/arch.mk
@@ -1,8 +1,5 @@
-CUDA_ARCH := \
-	-gencode arch=compute_35,code=sm_35 \
-	-gencode arch=compute_37,code=sm_37 \
-	-gencode arch=compute_50,code=sm_50 \
-	-gencode arch=compute_52,code=sm_52 \
-	-gencode arch=compute_60,code=sm_60 \
-	-gencode arch=compute_61,code=sm_61 \
-	-gencode arch=compute_61,code=compute_61
+# nvcc handles architectures.
+# Change this if you need to.
+CUDA_ARCH := 
+
+NVCCFLAGS += -arch=native
diff --git a/rt/include/aolist.h b/rt/include/aolist.h
index 5019b3f..0c0cf34 100644
--- a/rt/include/aolist.h
+++ b/rt/include/aolist.h
@@ -131,8 +131,9 @@ struct AppendOnlyList {
       assert(lindex <= size);
     }
     
-    lindex = cub::ShuffleBroadcast(lindex, first);
-    //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1
+    unsigned mask = __activemask();
+    lindex = __shfl_sync(mask, lindex, first);
+
     return lindex + offset;
   }
 
diff --git a/rt/include/mgpu b/rt/include/mgpu
new file mode 160000
index 0000000..4cddabe
--- /dev/null
+++ b/rt/include/mgpu
@@ -0,0 +1 @@
+Subproject commit 4cddabe1201a325fc630ed4ce718583d35658f8e
diff --git a/rt/include/worklist.h b/rt/include/worklist.h
index 384b223..aa6516e 100644
--- a/rt/include/worklist.h
+++ b/rt/include/worklist.h
@@ -361,8 +361,8 @@ struct Worklist {
       // counting density makes no sense -- it is always 1
     }
     
-    lindex = cub::ShuffleBroadcast(lindex, first);
-    //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1
+    unsigned mask = __activemask();
+    lindex = __shfl_sync(mask, lindex, first);
 
     return lindex + offset;
   }
@@ -383,8 +383,8 @@ struct Worklist {
 #endif     
     }
     
-    lindex = cub::ShuffleBroadcast(lindex, first);
-    //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1
+    unsigned mask = __activemask();
+    lindex = __shfl_sync(mask, lindex, first);
 
     return lindex + offset;
   }
@@ -405,8 +405,8 @@ struct Worklist {
 #endif     
     }
     
-    lindex = cub::ShuffleBroadcast(lindex, 0);
-    // lindex = cub::ShuffleIndex(lindex, 0); // CUB > 1.3.1
+    unsigned mask = __activemask();
+    lindex = __shfl_sync(mask, lindex, 0);
 
     return lindex + offset;
   }
@@ -740,8 +740,8 @@ struct Worklist2Light {
       // counting density makes no sense -- it is always 1
     }
     
-    lindex = cub::ShuffleBroadcast(lindex, first);
-    //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1
+    unsigned mask = __activemask();
+    lindex = __shfl_sync(mask, lindex, first);
 
     return lindex + offset;
   }
diff --git a/rt/src/Makefile b/rt/src/Makefile
index 91ce2f5..99e9737 100644
--- a/rt/src/Makefile
+++ b/rt/src/Makefile
@@ -1,5 +1,5 @@
 include ../../arch.mk
-INCLUDES=-I../include -I../include/mgpu/include -I../include/cub
+INCLUDES=-I../include -I../include/mgpu/include
 LIBDIR=../lib
 SNAPPY_INC=~/.local/include
 SNAPPY_LIB=~/.local/lib
diff --git a/rt/src/csr_graph.cu b/rt/src/csr_graph.cu
index cacf5de..31f58ff 100644
--- a/rt/src/csr_graph.cu
+++ b/rt/src/csr_graph.cu
@@ -40,7 +40,7 @@ unsigned CSRGraph::allocOnHost() {
   size_t mem_usage = ((nnodes + 1) + nedges) * sizeof(index_type) 
     + (nedges) * sizeof(edge_data_type) + (nnodes) * sizeof(node_data_type);
     
-  printf("Host memory for graph: %3u MB\n", mem_usage / 1048756);
+  printf("Host memory for graph: %lu MB\n", mem_usage / 1048756);
 
   return (edge_data && row_start && edge_dst && node_data);
 }