diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..4c5ff2a --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +root = true + +[Makefile] +end_of_line = lf +indent_style = tab +indent_size = 8 + +# Unix-style newlines with a newline ending every file +[*] +end_of_line = lf +insert_final_newline = true +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true +max_line_length = 80 + diff --git a/.gitignore b/.gitignore index 793b258..fe1cf4a 100644 --- a/.gitignore +++ b/.gitignore @@ -37,8 +37,7 @@ apps/pr/pr apps/sgd/sgd apps/triangle/triangle apps/triangle/triangle_nontex -cub inputs/ -mgpu rt/src/*.o rt/lib/libggrt.a +cscope.out diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..54d04bb --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "rt/include/mgpu"] + path = rt/include/mgpu + url = https://github.com/moderngpu/moderngpu + branch = branch_1.1 diff --git a/Makefile b/Makefile index f5c2a34..81d2e01 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ TOPLEVEL := . IRAPPS := bfs mst sssp sgd dmr mis cc pr triangle APPS := bh pta -INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2 -BIP_INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz +INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2 +BIP_INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz INPUT := lonestargpu2-inputs.tar.bz2 BIP_INPUT := lonestargpu21-bipartite-inputs.tar.xz @@ -29,7 +29,7 @@ inputs: @echo "Uncompressing inputs ..." @tar xvf $(INPUT) @tar xvf $(BIP_INPUT) - @rm $(INPUT) $(BIP_INPUT) + @echo "Do you want to: rm $(INPUT) $(BIP_INPUT)" @echo "Inputs available at $(TOPLEVEL)/inputs/" clean: diff --git a/README.md b/README.md index 660ff5f..f2385d7 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,10 @@ The LonestarGPU (LSG) suite contains CUDA implementations of several irregular algorithms that exhibit amorphous data parallelism. +The benchmarks in this repository have been moved to +[Galois](https://github.com/IntelligentSoftwareSystems/Galois/): the +instructions for running GPU benchmarks may be found in the `README`. + # INSTALLATION You can checkout the latest release by typing (in a terminal): @@ -21,24 +25,22 @@ git clone https://github.com/IntelligentSoftwareSystems/GaloisGPU ## Software pre-requisites -* CUB (v1.3.1) - -[https://github.com/NVlabs/cub](https://github.com/NVlabs/cub) - - * ModernGPU (v1.1) [https://github.com/NVlabs/moderngpu/releases](https://github.com/NVlabs/moderngpu/releases) -You will need to download CUB and Mgpu. +ModernGPU can be downloaded as a submodule using `git submodule --init --recursive`. + +Else, you may download it manually. Assuming LSGDIR contains the LonestarGPU source (i.e., this repository): ```Shell cd $LSGDIR/rt/include -ln -s path-to-cub-x.y/ cub ln -s path-to-mgpu-x.y/ mgpu ``` +CUB is included in recent CUDA releases, and need not be installed separately. + To compile for a specific GPU architecture, make changes to the arch.mk file in the top directory. ## BUILDING diff --git a/apps/mis/Makefile b/apps/mis/Makefile index f35a72d..c206060 100644 --- a/apps/mis/Makefile +++ b/apps/mis/Makefile @@ -1,10 +1,10 @@ # expect to be in ROOT/app/ include ../../arch.mk -INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub +INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include LIBDIR=../../rt/lib LIBS=-lggrt -lcurand SKELAPP=../../skelapp -EXTRAFLAGS=-g -O3 -DCSRG_TEX -w +EXTRAFLAGS=-g -O3 -DCSRG_TEX COMPRESS_LIBS=-lz SNAPPY_LIBDIR=~/.local/lib diff --git a/apps/mis/kernel.cu b/apps/mis/kernel.cu index 0100b00..babeec5 100644 --- a/apps/mis/kernel.cu +++ b/apps/mis/kernel.cu @@ -167,35 +167,44 @@ void gg_main_pipe_1(CSRGraphTy& gg, int& STEPS, Shared& prio, Pipe } } } -__global__ void gg_main_pipe_1_gpu(CSRGraphTy gg, int STEPS, unsigned int* prio, PipeContextT pipe, dim3 blocks, dim3 threads, int* cl_STEPS) -{ - unsigned tid = TID_1D; - unsigned nthreads = TOTAL_THREADS_1D; +// Changed this from __global__ to a CPU function. +// Changed STEPS from a GPU variable to a CPU variable. +void gg_main_pipe_1_gpu(CSRGraphTy gg, + unsigned int* prio, + PipeContextT pipe, + dim3 blocks, + dim3 threads, + int* cl_STEPS) +{ const unsigned __kernel_tb_size = __tb_one; - STEPS = *cl_STEPS; + int STEPS = *cl_STEPS; { init_wl <<>>(gg, pipe.in_wl(), pipe.out_wl()); pipe.in_wl().swap_slots(); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); pipe.advance2(); while (pipe.in_wl().nitems()) { mark_nodes <<>>(gg, prio, pipe.in_wl(), pipe.out_wl()); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); drop_marked_nodes_and_nbors <<>>(gg, pipe.in_wl(), pipe.out_wl()); pipe.in_wl().swap_slots(); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); pipe.advance2(); STEPS++; } } - if (tid == 0) - { - *cl_STEPS = STEPS; - } + + *cl_STEPS = STEPS; } -void gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared& prio, PipeContextT& pipe, dim3& blocks, dim3& threads) + +void gg_main_pipe_1_wrapper(CSRGraphTy& gg, + int& STEPS, + Shared& prio, + PipeContextT& pipe, + dim3& blocks, + dim3& threads) { if (false) { @@ -203,14 +212,9 @@ void gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared& pr } else { - int* cl_STEPS; - check_cuda(cudaMalloc(&cl_STEPS, sizeof(int) * 1)); - check_cuda(cudaMemcpy(cl_STEPS, &STEPS, sizeof(int) * 1, cudaMemcpyHostToDevice)); + int* cl_STEPS = &STEPS; - gg_main_pipe_1_gpu<<<1,1>>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,blocks,threads,cl_STEPS); - // gg_main_pipe_1_gpu_gb<<>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,cl_STEPS, gg_main_pipe_1_gpu_gb_barrier); - check_cuda(cudaMemcpy(&STEPS, cl_STEPS, sizeof(int) * 1, cudaMemcpyDeviceToHost)); - check_cuda(cudaFree(cl_STEPS)); + gg_main_pipe_1_gpu(gg, prio.gpu_wr_ptr(), pipe, blocks, threads, cl_STEPS); } } void gg_main(CSRGraphTy& hg, CSRGraphTy& gg) @@ -223,10 +227,10 @@ void gg_main(CSRGraphTy& hg, CSRGraphTy& gg) ggc::Timer t ("random"); t.start(); gen_prio_gpu <<>>(gg, prio.gpu_wr_ptr(), SEED1, SEED2, SEED3, SEED4); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); t.stop(); printf("Random number generation took %llu ns\n", t.duration()); pipe = PipeContextT(gg.nnodes); gg_main_pipe_1_wrapper(gg,STEPS,prio,pipe,blocks,threads); printf("Total steps: %d\n", STEPS); -} \ No newline at end of file +} diff --git a/apps/mst/Makefile b/apps/mst/Makefile index 7e40e6c..fbf7bbd 100644 --- a/apps/mst/Makefile +++ b/apps/mst/Makefile @@ -1,13 +1,15 @@ # expect to be in ROOT/app/ include ../../arch.mk -INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub +INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include LIBDIR=../../rt/lib LIBS=-lggrt -lcurand SKELAPP=../../skelapp -EXTRAFLAGS=-g -O3 -DCSRG_TEX -w +EXTRAFLAGS=-g -O3 -DCSRG_TEX COMPRESS_LIBS=-lz SNAPPY_LIBDIR=~/.local/lib +NVCCFLAGS += -arch=native + ifeq ($(USE_SNAPPY),1) COMPRESS_LIBS+=-lsnappy -L $(SNAPPY_LIBDIR) endif @@ -36,7 +38,7 @@ skel.o: $(SKELAPP)/skel.cu nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< -o $@ skel-nontex.o: $(SKELAPP)/skel.cu - nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(NVCCFLAGS) $(INCLUDES) $< -o $@ + nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(NVCCFLAGS) $(INCLUDES) $< -o $@ %.o: %.cu # nvcc -lineinfo -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@ @@ -44,10 +46,10 @@ skel-nontex.o: $(SKELAPP)/skel.cu # this is only needed if the AST contained 'CSRGraphTy' instead of Graph().param() or such. kernel-nontex.o: kernel-nontex.cu - nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@ + nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@ support-nontex.o: support.cu - nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@ + nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@ %.cubin: %.cu nvcc -cubin -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@ @@ -58,7 +60,7 @@ mst: skel.o kernel.o support.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o cp $@ ../../bin mst_nontex: skel-nontex.o kernel-nontex.o support-nontex.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o - nvcc -g -O3 -w $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS) + nvcc -g -O3 $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS) cp $@ ../../bin mst-dp: skel.o kernel.o support.o diff --git a/apps/sssp/Makefile b/apps/sssp/Makefile index 92b826a..1660dd7 100644 --- a/apps/sssp/Makefile +++ b/apps/sssp/Makefile @@ -1,10 +1,10 @@ # expect to be in ROOT/app/ include ../../arch.mk -INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub +INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include LIBDIR=../../rt/lib LIBS=-lggrt -lcurand SKELAPP=../../skelapp -EXTRAFLAGS=-g -O3 -w +EXTRAFLAGS=-g -O3 COMPRESS_LIBS=-lz SNAPPY_LIBDIR=~/.local/lib diff --git a/apps/sssp/kernel.cu b/apps/sssp/kernel.cu index dbec6b6..473da89 100644 --- a/apps/sssp/kernel.cu +++ b/apps/sssp/kernel.cu @@ -327,25 +327,25 @@ void gg_main_pipe_1(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELT { t_work.reset_thread_work(); Inspect_sssp_kernel_dev <<>>(gg, curdelta, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb, pipe.in_wl(), pipe.out_wl()); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); int num_items = t_work.thread_work_wl.in_wl().nitems(); if (num_items != 0) { t_work.compute_prefix_sum(); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); sssp_kernel_dev_TB_LB <<>>(gg, curdelta, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl, pipe.in_wl(), pipe.out_wl(), pipe.re_wl()); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); } } sssp_kernel <<>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl()); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); pipe.in_wl().swap_slots(); pipe.retry2(); } pipe.advance2(); pipe.out_wl().will_write(); remove_dups <<>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); pipe.in_wl().swap_slots(); pipe.advance2(); i++; @@ -389,40 +389,7 @@ __global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu *cl_i = i; } } -__global__ void gg_main_pipe_1_gpu(CSRGraph gg, gint_p glevel, int curdelta, int i, int DELTA, GlobalBarrier remove_dups_barrier, int remove_dups_blocks, PipeContextT pipe, dim3 blocks, dim3 threads, int* cl_curdelta, int* cl_i, bool enable_lb) -{ - unsigned tid = TID_1D; - unsigned nthreads = TOTAL_THREADS_1D; - const unsigned __kernel_tb_size = __tb_one; - curdelta = *cl_curdelta; - i = *cl_i; - while (pipe.in_wl().nitems()) - { - while (pipe.in_wl().nitems()) - { - sssp_kernel <<>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl()); - cudaDeviceSynchronize(); - pipe.in_wl().swap_slots(); - cudaDeviceSynchronize(); - pipe.retry2(); - } - cudaDeviceSynchronize(); - pipe.advance2(); - remove_dups <<>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier); - cudaDeviceSynchronize(); - pipe.in_wl().swap_slots(); - cudaDeviceSynchronize(); - pipe.advance2(); - i++; - curdelta += DELTA; - } - if (tid == 0) - { - *cl_curdelta = curdelta; - *cl_i = i; - } -} void gg_main_pipe_1_wrapper(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELTA, GlobalBarrier& remove_dups_barrier, int remove_dups_blocks, PipeContextT& pipe, dim3& blocks, dim3& threads) { static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier; @@ -467,7 +434,7 @@ void gg_main(CSRGraph& hg, CSRGraph& gg) static const size_t remove_dups_blocks = GG_MIN(blocks.x, ggc_get_nSM() * remove_dups_residency); if(!remove_dups_barrier_inited) { remove_dups_barrier.Setup(remove_dups_blocks); remove_dups_barrier_inited = true;}; kernel <<>>(gg, start_node); - cudaDeviceSynchronize(); + assert(cudaDeviceSynchronize() == cudaSuccess); int i = 0; int curdelta = 0; printf("delta: %d\n", DELTA); @@ -477,4 +444,4 @@ void gg_main(CSRGraph& hg, CSRGraph& gg) pipe.in_wl().update_gpu(1); gg_main_pipe_1_wrapper(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,blocks,threads); printf("iterations: %d\n", i); -} \ No newline at end of file +} diff --git a/arch.mk b/arch.mk index 293cefb..39de219 100644 --- a/arch.mk +++ b/arch.mk @@ -1,8 +1,5 @@ -CUDA_ARCH := \ - -gencode arch=compute_35,code=sm_35 \ - -gencode arch=compute_37,code=sm_37 \ - -gencode arch=compute_50,code=sm_50 \ - -gencode arch=compute_52,code=sm_52 \ - -gencode arch=compute_60,code=sm_60 \ - -gencode arch=compute_61,code=sm_61 \ - -gencode arch=compute_61,code=compute_61 +# nvcc handles architectures. +# Change this if you need to. +CUDA_ARCH := + +NVCCFLAGS += -arch=native diff --git a/rt/include/aolist.h b/rt/include/aolist.h index 5019b3f..0c0cf34 100644 --- a/rt/include/aolist.h +++ b/rt/include/aolist.h @@ -131,8 +131,9 @@ struct AppendOnlyList { assert(lindex <= size); } - lindex = cub::ShuffleBroadcast(lindex, first); - //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1 + unsigned mask = __activemask(); + lindex = __shfl_sync(mask, lindex, first); + return lindex + offset; } diff --git a/rt/include/mgpu b/rt/include/mgpu new file mode 160000 index 0000000..4cddabe --- /dev/null +++ b/rt/include/mgpu @@ -0,0 +1 @@ +Subproject commit 4cddabe1201a325fc630ed4ce718583d35658f8e diff --git a/rt/include/worklist.h b/rt/include/worklist.h index 384b223..aa6516e 100644 --- a/rt/include/worklist.h +++ b/rt/include/worklist.h @@ -361,8 +361,8 @@ struct Worklist { // counting density makes no sense -- it is always 1 } - lindex = cub::ShuffleBroadcast(lindex, first); - //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1 + unsigned mask = __activemask(); + lindex = __shfl_sync(mask, lindex, first); return lindex + offset; } @@ -383,8 +383,8 @@ struct Worklist { #endif } - lindex = cub::ShuffleBroadcast(lindex, first); - //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1 + unsigned mask = __activemask(); + lindex = __shfl_sync(mask, lindex, first); return lindex + offset; } @@ -405,8 +405,8 @@ struct Worklist { #endif } - lindex = cub::ShuffleBroadcast(lindex, 0); - // lindex = cub::ShuffleIndex(lindex, 0); // CUB > 1.3.1 + unsigned mask = __activemask(); + lindex = __shfl_sync(mask, lindex, 0); return lindex + offset; } @@ -740,8 +740,8 @@ struct Worklist2Light { // counting density makes no sense -- it is always 1 } - lindex = cub::ShuffleBroadcast(lindex, first); - //lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1 + unsigned mask = __activemask(); + lindex = __shfl_sync(mask, lindex, first); return lindex + offset; } diff --git a/rt/src/Makefile b/rt/src/Makefile index 91ce2f5..99e9737 100644 --- a/rt/src/Makefile +++ b/rt/src/Makefile @@ -1,5 +1,5 @@ include ../../arch.mk -INCLUDES=-I../include -I../include/mgpu/include -I../include/cub +INCLUDES=-I../include -I../include/mgpu/include LIBDIR=../lib SNAPPY_INC=~/.local/include SNAPPY_LIB=~/.local/lib diff --git a/rt/src/csr_graph.cu b/rt/src/csr_graph.cu index cacf5de..31f58ff 100644 --- a/rt/src/csr_graph.cu +++ b/rt/src/csr_graph.cu @@ -40,7 +40,7 @@ unsigned CSRGraph::allocOnHost() { size_t mem_usage = ((nnodes + 1) + nedges) * sizeof(index_type) + (nedges) * sizeof(edge_data_type) + (nnodes) * sizeof(node_data_type); - printf("Host memory for graph: %3u MB\n", mem_usage / 1048756); + printf("Host memory for graph: %lu MB\n", mem_usage / 1048756); return (edge_data && row_start && edge_dst && node_data); }