Add sequential RCM on CPU.
Also change the parallel algorithm a little bit to remove a large
overhead.
ihcinihsdk committed Jun 13, 2014
1 parent e7e8840 commit b7cb153
Showing 6 changed files with 497 additions and 97 deletions.
10 changes: 6 additions & 4 deletions CMakeLists.txt
@@ -9,6 +9,7 @@ MESSAGE(STATUS "Cuda version: ${CUDA_VERSION}")
 
 SET(RCM_HEADERS
 rcm/common.h
+rcm/rcm.h
 rcm/rcm_um.h
 rcm/exception.h
 rcm/timer.h
@@ -23,15 +24,16 @@ SET(MMIO_FILES
 mm_io/mm_io.c
 )
 
-SOURCE_GROUP("Headers" FILES ${MC64_HEADERS})
-SOURCE_GROUP("CUDA Headers" FILES ${MC64_CUHEADERS})
+SOURCE_GROUP("Headers" FILES ${RCM_HEADERS})
+SOURCE_GROUP("CUDA Headers" FILES ${RCM_CUHEADERS})
 SOURCE_GROUP("MM_IO" FILES ${MMIO_FILES})
 
 INCLUDE_DIRECTORIES(
 ${CMAKE_SOURCE_DIR}
 )
 
 IF(NOT (${CUDA_VERSION} VERSION_LESS "6.0"))
-cuda_add_executable(driver_um driver_um.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES})
-cuda_add_executable(testing testing.cu ${MC64_HEADERS} ${MC64_CUHEADERS} ${MMIO_FILES})
+cuda_add_executable(driver_um driver_um.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
+cuda_add_executable(testing testing.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
+cuda_add_executable(testing_um testing_um.cu ${RCM_HEADERS} ${RCM_CUHEADERS} ${MMIO_FILES})
 ENDIF()
5 changes: 4 additions & 1 deletion rcm/device/kernels.cuh
@@ -27,7 +27,8 @@ __global__ void achieveLevels(int N,
 bool* frontier,
 int* visited,
 int* updated_by,
-int* levels)
+int* levels,
+bool* has_frontier)
 {
 int bid = blockIdx.x + blockIdx.y * gridDim.x;
 
@@ -48,6 +49,8 @@
 frontier[column] = true;
 updated_by[column] = bid + 1;
 levels[column] = cur_cost + 1;
+if (!(*has_frontier))
+*has_frontier = true;
 }
 }
 
331 changes: 331 additions & 0 deletions rcm/rcm.h
@@ -0,0 +1,331 @@
#ifndef RCM_H
#define RCM_H

#include <rcm/common.h>
#include <rcm/exception.h>
#include <rcm/device/kernels.cuh>

#include <thrust/scan.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/gather.h>
#include <thrust/binary_search.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/logical.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/adjacent_difference.h>
#include <thrust/inner_product.h>

#include <queue>

extern "C" {
#include "mm_io/mm_io.h"
}

namespace rcm {

class RCM_base
{
protected:
int m_half_bandwidth;
int m_half_bandwidth_original;

size_t m_n;
size_t m_nnz;

typedef typename thrust::tuple<int, int, int, int> IntTuple;

template <typename IVector>
static void offsets_to_indices(const IVector& offsets, IVector& indices)
{
// convert compressed row offsets into uncompressed row indices
thrust::fill(indices.begin(), indices.end(), 0);
thrust::scatter( thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(offsets.size()-1),
offsets.begin(),
indices.begin());
thrust::inclusive_scan(indices.begin(), indices.end(), indices.begin(), thrust::maximum<int>());
}

template <typename IVector>
static void indices_to_offsets(const IVector& indices, IVector& offsets)
{
// convert uncompressed row indices into compressed row offsets
thrust::lower_bound(indices.begin(),
indices.end(),
thrust::counting_iterator<int>(0),
thrust::counting_iterator<int>(offsets.size()),
offsets.begin());
}

public:

virtual ~RCM_base() {}

struct Difference: public thrust::binary_function<int, int, int>
{
inline
__host__ __device__
int operator() (const int &a, const int &b) const {
return abs(a-b);
}
};

struct ExtendedDifference: public thrust::binary_function<int, int, int>
{
int *m_perm;

ExtendedDifference(int *perm): m_perm(perm) {}
inline
__host__ __device__
int operator() (const int &a, const int &b) const {
return abs(m_perm[a]-m_perm[b]);
}
};

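// Orders nodes by (BFS level, discovering block, node index); presumably used by the
// GPU reordering in rcm_um.h, which produces the levels and updated_by arrays via achieveLevels().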
struct TupleCompare
{
inline
__host__ __device__
bool operator() (IntTuple a, IntTuple b) const
{
int a_level = thrust::get<0>(a), b_level = thrust::get<0>(b);
if (a_level != b_level) return a_level < b_level;
int a_updated_by = thrust::get<1>(a), b_updated_by = thrust::get<1>(b);
if (a_updated_by != b_updated_by) return a_updated_by < b_updated_by;
return thrust::get<2>(a) < thrust::get<2>(b);
}
};

int getHalfBandwidth() const {return m_half_bandwidth;}
int getHalfBandwidthOriginal() const {return m_half_bandwidth_original;}

virtual void execute() = 0;
};

class RCM: public RCM_base
{
private:
typedef typename thrust::host_vector<int> IntVectorH;
typedef typename thrust::host_vector<double> DoubleVectorH;
typedef typename thrust::host_vector<bool> BoolVectorH;

typedef typename IntVectorH::iterator IntIterator;
typedef typename thrust::tuple<IntIterator, IntIterator> IntIteratorTuple;
typedef typename thrust::zip_iterator<IntIteratorTuple> EdgeIterator;

typedef typename thrust::tuple<int, int> NodeType;

IntVectorH m_row_offsets;
IntVectorH m_column_indices;
DoubleVectorH m_values;

IntVectorH m_perm;

void buildTopology(EdgeIterator& begin,
EdgeIterator& end,
int node_begin,
int node_end,
IntVectorH& row_offsets,
IntVectorH& column_indices);

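// Orders (node, degree) pairs so the std::priority_queue below yields the
// lowest-degree node first, as Cuthill-McKee requires.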
struct CompareValue
{
inline
bool operator() (NodeType a, NodeType b) const {
return thrust::get<1>(a) > thrust::get<1>(b);
}
};

public:
RCM(const IntVectorH& row_offsets,
const IntVectorH& column_indices,
const DoubleVectorH& values)
: m_row_offsets(row_offsets),
m_column_indices(column_indices),
m_values(values)
{
size_t n = row_offsets.size() - 1;
m_perm.resize(n);
m_n = n;
m_nnz = m_values.size();
}

~RCM() {}

void execute();
};

void
RCM::execute()
{
IntVectorH tmp_reordering(m_n);
IntVectorH tmp_perm(m_n);

thrust::sequence(tmp_reordering.begin(), tmp_reordering.end());

IntVectorH row_indices(m_nnz);
IntVectorH tmp_row_indices(m_nnz << 1);
IntVectorH tmp_column_indices(m_nnz << 1);
IntVectorH tmp_row_offsets(m_n + 1);
offsets_to_indices(m_row_offsets, row_indices);

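// Half-bandwidth of the original ordering: max |i - j| over all non-zero entries (i, j).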
m_half_bandwidth_original = m_half_bandwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), Difference());

EdgeIterator begin = thrust::make_zip_iterator(thrust::make_tuple(row_indices.begin(), m_column_indices.begin()));
EdgeIterator end = thrust::make_zip_iterator(thrust::make_tuple(row_indices.end(), m_column_indices.end()));
buildTopology(begin, end, 0, m_n, tmp_row_offsets, tmp_column_indices);

const int MAX_NUM_TRIAL = 5;

BoolVectorH tried(m_n, false);
tried[0] = true;

int last_tried = 0;

m_perm.resize(m_n);
thrust::sequence(m_perm.begin(), m_perm.end());

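// Try up to MAX_NUM_TRIAL starting nodes and keep the permutation that gives
// the smallest half-bandwidth.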
for (int trial_num = 0; trial_num < MAX_NUM_TRIAL ; trial_num++)
{
std::queue<int> q;
std::priority_queue<NodeType, std::vector<NodeType>, CompareValue > pq;

int tmp_node;
BoolVectorH pushed(m_n, false);

int left_cnt = m_n;
int j = 0, last = 0;

if (trial_num > 0) {

if (trial_num < MAX_NUM_TRIAL) {
tmp_node = rand() % m_n;

while(tried[tmp_node])
tmp_node = rand() % m_n;
} else {
if (last_tried >= m_n - 1) {
fprintf(stderr, "All possible starting points have been tried in RCM\n");
break;
}
for (tmp_node = last_tried+1; tmp_node < m_n; tmp_node++)
if (!tried[tmp_node]) {
last_tried = tmp_node;
break;
}
}

pushed[tmp_node] = true;
tried[tmp_node] = true;
q.push(tmp_node);
}

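// Cuthill-McKee BFS: pop a node, append it to the new ordering, and enqueue its
// unvisited neighbors in order of increasing degree. If the queue empties
// (disconnected graph), restart from the next node that has not been visited.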
while(left_cnt--) {
if(q.empty()) {
left_cnt++;
int i;

for(i = last; i < m_n; i++) {
if(!pushed[i]) {
q.push(i);
pushed[i] = true;
last = i;
break;
}
}
if(i < m_n) continue;
fprintf(stderr, "Can never get here!\n");
return;
}

tmp_node = q.front();
tmp_reordering[j] = tmp_node;
j++;

q.pop();

int start_idx = tmp_row_offsets[tmp_node], end_idx = tmp_row_offsets[tmp_node + 1];

for (int i = start_idx; i < end_idx; i++) {
int target_node = tmp_column_indices[i];
if(!pushed[target_node]) {
pushed[target_node] = true;
pq.push(thrust::make_tuple(target_node, tmp_row_offsets[target_node + 1] - tmp_row_offsets[target_node]));
}
}

while(!pq.empty()) {
q.push(thrust::get<0>(pq.top()));
pq.pop();
}
}

thrust::scatter(thrust::make_counting_iterator(0),
thrust::make_counting_iterator((int)(m_n)),
tmp_reordering.begin(),
tmp_perm.begin());

{
int *perm_array = thrust::raw_pointer_cast(&tmp_perm[0]);
int tmp_bdwidth = thrust::inner_product(row_indices.begin(), row_indices.end(), m_column_indices.begin(), 0, thrust::maximum<int>(), ExtendedDifference(perm_array));

if (m_half_bandwidth > tmp_bdwidth) {
m_half_bandwidth = tmp_bdwidth;
m_perm = tmp_perm;
}
}
}
}

void
RCM::buildTopology(EdgeIterator& begin,
EdgeIterator& end,
int node_begin,
int node_end,
IntVectorH& row_offsets,
IntVectorH& column_indices)
{
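// Build a symmetrized adjacency structure in CSR form from the edge list,
// dropping diagonal entries: each off-diagonal edge (i, j) is stored as both (i, j) and (j, i).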
if (row_offsets.size() != m_n + 1)
row_offsets.resize(m_n + 1, 0);
else
thrust::fill(row_offsets.begin(), row_offsets.end(), 0);

IntVectorH row_indices((end - begin) << 1);
column_indices.resize((end - begin) << 1);
int actual_cnt = 0;

for(EdgeIterator edgeIt = begin; edgeIt != end; edgeIt++) {
int from = thrust::get<0>(*edgeIt), to = thrust::get<1>(*edgeIt);
if (from != to) {
row_indices[actual_cnt] = from;
column_indices[actual_cnt] = to;
row_indices[actual_cnt + 1] = to;
column_indices[actual_cnt + 1] = from;
actual_cnt += 2;
}
}
row_indices.resize(actual_cnt);
column_indices.resize(actual_cnt);
// thrust::sort_by_key(row_indices.begin(), row_indices.end(), column_indices.begin());
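// Counting sort of the edges by row index: the bucket counts are turned into offsets
// by the inclusive scan, and the backward pass places each column index while
// converting the offsets back into row starts.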
{
int& nnz = actual_cnt;
IntVectorH tmp_column_indices(nnz);
for (int i = 0; i < nnz; i++)
row_offsets[row_indices[i]] ++;

thrust::inclusive_scan(row_offsets.begin() + node_begin, row_offsets.begin() + (node_end + 1), row_offsets.begin() + node_begin);

for (int i = nnz - 1; i >= 0; i--) {
int idx = (--row_offsets[row_indices[i]]);
tmp_column_indices[idx] = column_indices[i];
}
column_indices = tmp_column_indices;
}
}

} // end namespace rcm

#endif
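For reference, a minimal sketch of how the new sequential rcm::RCM class might be driven. This example is not part of the commit: it assumes a small CSR matrix built by hand (the drivers in the repository load matrices through mm_io instead), and it would need to be compiled with nvcc like the existing targets, since rcm/rcm.h pulls in the CUDA kernel header.

#include <cstdio>
#include <thrust/host_vector.h>
#include <rcm/rcm.h>

int main()
{
	// A small symmetric 4x4 sparsity pattern in CSR form; only the pattern
	// matters for the ordering, not the numerical values.
	int    offsets[] = {0, 2, 4, 6, 8};
	int    columns[] = {0, 3, 1, 2, 1, 2, 0, 3};
	double vals[]    = {1, 1, 1, 1, 1, 1, 1, 1};

	thrust::host_vector<int>    row_offsets(offsets, offsets + 5);
	thrust::host_vector<int>    column_indices(columns, columns + 8);
	thrust::host_vector<double> values(vals, vals + 8);

	rcm::RCM algo(row_offsets, column_indices, values);
	algo.execute();

	// Compare the half-bandwidth before and after reordering.
	printf("Half-bandwidth: %d -> %d\n",
	       algo.getHalfBandwidthOriginal(), algo.getHalfBandwidth());

	return 0;
}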
