Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
root = true

[Makefile]
end_of_line = lf
indent_style = tab
indent_size = 8

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 2
trim_trailing_whitespace = true
max_line_length = 80

3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ apps/pr/pr
apps/sgd/sgd
apps/triangle/triangle
apps/triangle/triangle_nontex
cub
inputs/
mgpu
rt/src/*.o
rt/lib/libggrt.a
cscope.out
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[submodule "rt/include/mgpu"]
path = rt/include/mgpu
url = https://github.com/moderngpu/moderngpu
branch = branch_1.1
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
TOPLEVEL := .
IRAPPS := bfs mst sssp sgd dmr mis cc pr triangle
APPS := bh pta
INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2
BIP_INPUT_URL := http://iss.ices.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz
INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu2-inputs.tar.bz2
BIP_INPUT_URL := https://iss.oden.utexas.edu/projects/galois/downloads/lonestargpu21-bipartite-inputs.tar.xz
INPUT := lonestargpu2-inputs.tar.bz2
BIP_INPUT := lonestargpu21-bipartite-inputs.tar.xz

Expand All @@ -29,7 +29,7 @@ inputs:
@echo "Uncompressing inputs ..."
@tar xvf $(INPUT)
@tar xvf $(BIP_INPUT)
@rm $(INPUT) $(BIP_INPUT)
@echo "Do you want to: rm $(INPUT) $(BIP_INPUT)"
@echo "Inputs available at $(TOPLEVEL)/inputs/"

clean:
Expand Down
16 changes: 9 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
The LonestarGPU (LSG) suite contains CUDA implementations of several
irregular algorithms that exhibit amorphous data parallelism.

The benchmarks in this repository have been moved to
[Galois](https://github.com/IntelligentSoftwareSystems/Galois/): the
instructions for running GPU benchmarks may be found in the `README`.

# INSTALLATION

You can checkout the latest release by typing (in a terminal):
Expand All @@ -21,24 +25,22 @@ git clone https://github.com/IntelligentSoftwareSystems/GaloisGPU

## Software pre-requisites

* CUB (v1.3.1)

[https://github.com/NVlabs/cub](https://github.com/NVlabs/cub)


* ModernGPU (v1.1)

[https://github.com/NVlabs/moderngpu/releases](https://github.com/NVlabs/moderngpu/releases)

You will need to download CUB and Mgpu.
ModernGPU can be downloaded as a submodule using `git submodule --init --recursive`.

Else, you may download it manually.
Assuming LSGDIR contains the LonestarGPU source (i.e., this repository):

```Shell
cd $LSGDIR/rt/include
ln -s path-to-cub-x.y/ cub
ln -s path-to-mgpu-x.y/ mgpu
```

CUB is included in recent CUDA releases, and need not be installed separately.

To compile for a specific GPU architecture, make changes to the arch.mk file in the top directory.

## BUILDING
Expand Down
4 changes: 2 additions & 2 deletions apps/mis/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# expect to be in ROOT/app/
include ../../arch.mk
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
LIBDIR=../../rt/lib
LIBS=-lggrt -lcurand
SKELAPP=../../skelapp
EXTRAFLAGS=-g -O3 -DCSRG_TEX -w
EXTRAFLAGS=-g -O3 -DCSRG_TEX
COMPRESS_LIBS=-lz
SNAPPY_LIBDIR=~/.local/lib

Expand Down
48 changes: 26 additions & 22 deletions apps/mis/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -167,50 +167,54 @@ void gg_main_pipe_1(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, Pipe
}
}
}
__global__ void gg_main_pipe_1_gpu(CSRGraphTy gg, int STEPS, unsigned int* prio, PipeContextT<WorklistT> pipe, dim3 blocks, dim3 threads, int* cl_STEPS)
{
unsigned tid = TID_1D;
unsigned nthreads = TOTAL_THREADS_1D;

// Changed this from __global__ to a CPU function.
// Changed STEPS from a GPU variable to a CPU variable.
void gg_main_pipe_1_gpu(CSRGraphTy gg,
unsigned int* prio,
PipeContextT<WorklistT> pipe,
dim3 blocks,
dim3 threads,
int* cl_STEPS)
{
const unsigned __kernel_tb_size = __tb_one;
STEPS = *cl_STEPS;
int STEPS = *cl_STEPS;
{
init_wl <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());
pipe.in_wl().swap_slots();
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
pipe.advance2();
while (pipe.in_wl().nitems())
{
mark_nodes <<<blocks, threads>>>(gg, prio, pipe.in_wl(), pipe.out_wl());
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
drop_marked_nodes_and_nbors <<<blocks, threads>>>(gg, pipe.in_wl(), pipe.out_wl());
pipe.in_wl().swap_slots();
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
pipe.advance2();
STEPS++;
}
}
if (tid == 0)
{
*cl_STEPS = STEPS;
}

*cl_STEPS = STEPS;
}
void gg_main_pipe_1_wrapper(CSRGraphTy& gg, int& STEPS, Shared<unsigned int>& prio, PipeContextT<WorklistT>& pipe, dim3& blocks, dim3& threads)

void gg_main_pipe_1_wrapper(CSRGraphTy& gg,
int& STEPS,
Shared<unsigned int>& prio,
PipeContextT<WorklistT>& pipe,
dim3& blocks,
dim3& threads)
{
if (false)
{
gg_main_pipe_1(gg,STEPS,prio,pipe,blocks,threads);
}
else
{
int* cl_STEPS;
check_cuda(cudaMalloc(&cl_STEPS, sizeof(int) * 1));
check_cuda(cudaMemcpy(cl_STEPS, &STEPS, sizeof(int) * 1, cudaMemcpyHostToDevice));
int* cl_STEPS = &STEPS;

gg_main_pipe_1_gpu<<<1,1>>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,blocks,threads,cl_STEPS);
// gg_main_pipe_1_gpu_gb<<<gg_main_pipe_1_gpu_gb_blocks, __tb_gg_main_pipe_1_gpu_gb>>>(gg,STEPS,prio.gpu_wr_ptr(),pipe,cl_STEPS, gg_main_pipe_1_gpu_gb_barrier);
check_cuda(cudaMemcpy(&STEPS, cl_STEPS, sizeof(int) * 1, cudaMemcpyDeviceToHost));
check_cuda(cudaFree(cl_STEPS));
gg_main_pipe_1_gpu(gg, prio.gpu_wr_ptr(), pipe, blocks, threads, cl_STEPS);
}
}
void gg_main(CSRGraphTy& hg, CSRGraphTy& gg)
Expand All @@ -223,10 +227,10 @@ void gg_main(CSRGraphTy& hg, CSRGraphTy& gg)
ggc::Timer t ("random");
t.start();
gen_prio_gpu <<<blocks, threads>>>(gg, prio.gpu_wr_ptr(), SEED1, SEED2, SEED3, SEED4);
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
t.stop();
printf("Random number generation took %llu ns\n", t.duration());
pipe = PipeContextT<WorklistT>(gg.nnodes);
gg_main_pipe_1_wrapper(gg,STEPS,prio,pipe,blocks,threads);
printf("Total steps: %d\n", STEPS);
}
}
14 changes: 8 additions & 6 deletions apps/mst/Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# expect to be in ROOT/app/
include ../../arch.mk
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
LIBDIR=../../rt/lib
LIBS=-lggrt -lcurand
SKELAPP=../../skelapp
EXTRAFLAGS=-g -O3 -DCSRG_TEX -w
EXTRAFLAGS=-g -O3 -DCSRG_TEX
COMPRESS_LIBS=-lz
SNAPPY_LIBDIR=~/.local/lib

NVCCFLAGS += -arch=native

ifeq ($(USE_SNAPPY),1)
COMPRESS_LIBS+=-lsnappy -L $(SNAPPY_LIBDIR)
endif
Expand Down Expand Up @@ -36,18 +38,18 @@ skel.o: $(SKELAPP)/skel.cu
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< -o $@

skel-nontex.o: $(SKELAPP)/skel.cu
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(NVCCFLAGS) $(INCLUDES) $< -o $@
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(NVCCFLAGS) $(INCLUDES) $< -o $@

%.o: %.cu
# nvcc -lineinfo -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@

# this is only needed if the AST contained 'CSRGraphTy' instead of Graph().param() or such.
kernel-nontex.o: kernel-nontex.cu
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@

support-nontex.o: support.cu
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 -w $(INCLUDES) $< $(EXTRA_SRC) -o $@
nvcc -dc $(CUDA_ARCH) $(CXXFLAGS) -g -O3 $(INCLUDES) $< $(EXTRA_SRC) -o $@

%.cubin: %.cu
nvcc -cubin -dc $(CUDA_ARCH) $(CXXFLAGS) $(EXTRAFLAGS) $(NVCCFLAGS) $(INCLUDES) $< $(EXTRA_SRC) -o $@
Expand All @@ -58,7 +60,7 @@ mst: skel.o kernel.o support.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o
cp $@ ../../bin

mst_nontex: skel-nontex.o kernel-nontex.o support-nontex.o $(SKELAPP)/mgpucontext.o $(SKELAPP)/mgpuutil.o
nvcc -g -O3 -w $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS)
nvcc -g -O3 $(CUDA_ARCH) $(CXXFLAGS) $(NVCCFLAGS) $(INCLUDES) -L$(LIBDIR) $(LIBS) -o $@ $^ -lcudadevrt $(COMPRESS_LIBS)
cp $@ ../../bin

mst-dp: skel.o kernel.o support.o
Expand Down
4 changes: 2 additions & 2 deletions apps/sssp/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
# expect to be in ROOT/app/
include ../../arch.mk
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include -I../../rt/include/cub
INCLUDES=-I../../rt/include -I../../rt/include/mgpu/include
LIBDIR=../../rt/lib
LIBS=-lggrt -lcurand
SKELAPP=../../skelapp
EXTRAFLAGS=-g -O3 -w
EXTRAFLAGS=-g -O3
COMPRESS_LIBS=-lz
SNAPPY_LIBDIR=~/.local/lib

Expand Down
47 changes: 7 additions & 40 deletions apps/sssp/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -327,25 +327,25 @@ void gg_main_pipe_1(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELT
{
t_work.reset_thread_work();
Inspect_sssp_kernel_dev <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_work_wl, t_work.thread_src_wl, enable_lb, pipe.in_wl(), pipe.out_wl());
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
int num_items = t_work.thread_work_wl.in_wl().nitems();
if (num_items != 0)
{
t_work.compute_prefix_sum();
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
sssp_kernel_dev_TB_LB <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, t_work.thread_prefix_work_wl.gpu_wr_ptr(), num_items, t_work.thread_src_wl, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
}
}
sssp_kernel <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
pipe.in_wl().swap_slots();
pipe.retry2();
}
pipe.advance2();
pipe.out_wl().will_write();
remove_dups <<<remove_dups_blocks, __tb_remove_dups>>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier);
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
pipe.in_wl().swap_slots();
pipe.advance2();
i++;
Expand Down Expand Up @@ -389,40 +389,7 @@ __global__ void __launch_bounds__(__tb_gg_main_pipe_1_gpu_gb) gg_main_pipe_1_gpu
*cl_i = i;
}
}
__global__ void gg_main_pipe_1_gpu(CSRGraph gg, gint_p glevel, int curdelta, int i, int DELTA, GlobalBarrier remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2> pipe, dim3 blocks, dim3 threads, int* cl_curdelta, int* cl_i, bool enable_lb)
{
unsigned tid = TID_1D;
unsigned nthreads = TOTAL_THREADS_1D;

const unsigned __kernel_tb_size = __tb_one;
curdelta = *cl_curdelta;
i = *cl_i;
while (pipe.in_wl().nitems())
{
while (pipe.in_wl().nitems())
{
sssp_kernel <<<blocks, __tb_sssp_kernel>>>(gg, curdelta, enable_lb, pipe.in_wl(), pipe.out_wl(), pipe.re_wl());
cudaDeviceSynchronize();
pipe.in_wl().swap_slots();
cudaDeviceSynchronize();
pipe.retry2();
}
cudaDeviceSynchronize();
pipe.advance2();
remove_dups <<<remove_dups_blocks, __tb_remove_dups>>>(glevel, pipe.in_wl(), pipe.out_wl(), remove_dups_barrier);
cudaDeviceSynchronize();
pipe.in_wl().swap_slots();
cudaDeviceSynchronize();
pipe.advance2();
i++;
curdelta += DELTA;
}
if (tid == 0)
{
*cl_curdelta = curdelta;
*cl_i = i;
}
}
void gg_main_pipe_1_wrapper(CSRGraph& gg, gint_p glevel, int& curdelta, int& i, int DELTA, GlobalBarrier& remove_dups_barrier, int remove_dups_blocks, PipeContextT<Worklist2>& pipe, dim3& blocks, dim3& threads)
{
static GlobalBarrierLifetime gg_main_pipe_1_gpu_gb_barrier;
Expand Down Expand Up @@ -467,7 +434,7 @@ void gg_main(CSRGraph& hg, CSRGraph& gg)
static const size_t remove_dups_blocks = GG_MIN(blocks.x, ggc_get_nSM() * remove_dups_residency);
if(!remove_dups_barrier_inited) { remove_dups_barrier.Setup(remove_dups_blocks); remove_dups_barrier_inited = true;};
kernel <<<blocks, threads>>>(gg, start_node);
cudaDeviceSynchronize();
assert(cudaDeviceSynchronize() == cudaSuccess);
int i = 0;
int curdelta = 0;
printf("delta: %d\n", DELTA);
Expand All @@ -477,4 +444,4 @@ void gg_main(CSRGraph& hg, CSRGraph& gg)
pipe.in_wl().update_gpu(1);
gg_main_pipe_1_wrapper(gg,glevel,curdelta,i,DELTA,remove_dups_barrier,remove_dups_blocks,pipe,blocks,threads);
printf("iterations: %d\n", i);
}
}
13 changes: 5 additions & 8 deletions arch.mk
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
CUDA_ARCH := \
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_37,code=sm_37 \
-gencode arch=compute_50,code=sm_50 \
-gencode arch=compute_52,code=sm_52 \
-gencode arch=compute_60,code=sm_60 \
-gencode arch=compute_61,code=sm_61 \
-gencode arch=compute_61,code=compute_61
# nvcc handles architectures.
# Change this if you need to.
CUDA_ARCH :=

NVCCFLAGS += -arch=native
5 changes: 3 additions & 2 deletions rt/include/aolist.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ struct AppendOnlyList {
assert(lindex <= size);
}

lindex = cub::ShuffleBroadcast(lindex, first);
//lindex = cub::ShuffleIndex(lindex, first); // CUB > 1.3.1
unsigned mask = __activemask();
lindex = __shfl_sync(mask, lindex, first);

return lindex + offset;
}

Expand Down
1 change: 1 addition & 0 deletions rt/include/mgpu
Submodule mgpu added at 4cddab
Loading