diff --git a/Caller.cu b/Caller.cu deleted file mode 100644 index 6f14a29..0000000 --- a/Caller.cu +++ /dev/null @@ -1,16 +0,0 @@ -#include -__global__ void -emptyKernel(){ -} - - - -extern "C" void -call(){ - dim3 grid( 1, 1, 1); - dim3 threads( 256, 1, 1); - emptyKernel <<>> (); - cudaThreadSynchronize(); - printf("Called\n"); -} - diff --git a/CoordWidget.cpp b/CoordWidget.cpp deleted file mode 100644 index b9f4891..0000000 --- a/CoordWidget.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include "CoordWidget.h" - -CoordWidget::CoordWidget() -{ - coordLabel = new QLabel("CoordLabel"); - QVBoxLayout *layout = new QVBoxLayout; - layout->addWidget(coordLabel); - setLayout(layout); -} - - - -CoordWidget::~CoordWidget() -{ -} - -void CoordWidget::setCoord(int x, int y){ - char temp[200]; - sprintf(temp,"%d,%d",x,y); - QString str = temp; - coordLabel->setText("Coord " + str); - -} \ No newline at end of file diff --git a/CoordWidget.h b/CoordWidget.h deleted file mode 100644 index d5f9569..0000000 --- a/CoordWidget.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef COORDWIDGET_H_ -#define COORDWIDGET_H_ - -#include -#include -#include - -class CoordWidget : public QWidget -{ - Q_OBJECT -public: - CoordWidget(); - virtual ~CoordWidget(); -private: - QLabel* coordLabel; -public slots: - void setCoord(int x, int y); - -}; - -#endif /*COORDWIDGET_H_*/ diff --git a/GLWidget.cpp b/GLWidget.cpp deleted file mode 100644 index 8e13b0e..0000000 --- a/GLWidget.cpp +++ /dev/null @@ -1,467 +0,0 @@ - -#include -#include -#include -#include - - - - -#include "GLWidget.h" - - -void filter(uchar4 ** h_Src,int width, int height, float * gradient){ - uchar4 temp[width*height]; - for(int i=0;i 0xff ) chanX= 0xff; - else chanX = (unsigned char) Sumx; - - short uly = temp[i+width-1].y; - short upy = temp[i+width].y; - short ury = temp[i+width+1].y; - - short cly = temp[i-1].y; - short cey = temp[i].y; - short cry = temp[i+1].y; - - short dly = temp[i-width-1].y; - short downy = temp[i-width].y; - short dry = temp[i-width+1].y; - - short Horzy = ury + 2*cry + dry - uly - 2*cly - dly; - short Verty = uly + 2*upy + ury - dly - 2*downy - dry; - Horzy /= 8; - Verty /= 8; - short Sumy = (short) sqrt( Horzy*Horzy + Verty*Verty); - unsigned char chanY; - if ( Sumy < 0 ) chanY= 0; - else if ( Sumy > 0xff ) chanY= 0xff; - else chanY = (unsigned char) Sumy; - - short ulz = temp[i+width-1].z; - short upz = temp[i+width].z; - short urz = temp[i+width+1].z; - - short clz = temp[i-1].z; - short cez = temp[i].z; - short crz = temp[i+1].z; - - short dlz = temp[i-width-1].z; - short downz = temp[i-width].z; - short drz = temp[i-width+1].z; - - short Horzz = urz + 2*crz + drz - ulz - 2*clz - dlz; - short Vertz = ulz + 2*upz + urz - dlz - 2*downz - drz; - Horzz /= 8; - Vertz /= 8; - short Sumz = (short) sqrt ( Horzz*Horzz + Vertz*Vertz ); - unsigned char chanZ; - if ( Sumz < 0 ) chanZ= 0; - else if ( Sumz > 0xff ) chanZ= 0xff; - else chanZ = (unsigned char) Sumz; - - - (*h_Src)[i].x = chanX; - (*h_Src)[i].y = chanY; - (*h_Src)[i].z = chanZ; - gradient[i] = 0.3 * chanX/255.0 + 0.59 * chanY/255.0 + 0.11 * chanZ/255.0; - - //(*h_Src)[i].y = std::max(temp[i].y - 50,0); - //(*h_Src)[i].z = std::max(temp[i].z - 50,0); - //(*h_Src)[i].w = temp[i].w ; - } - -} - -GLWidget::GLWidget(QWidget* parent) : QGLWidget(parent) -{ - zoomFactor = 1.0f; - zoomFactorX = 1.0f; - zoomFactorY = 1.0f; - ry=0.0f; - dx=0.0f; - setFocusPolicy (Qt::StrongFocus); - setMouseTracking(true); - myWindow = (Window*)parent; - calculatedGradient=false; - chosenTexture = ORIGINALTEX; - drawTeapot = 0; - approach=0.0f; - -} - -GLWidget::~GLWidget() -{ -} - - QSize GLWidget::minimumSizeHint() const - { - return QSize(50, 50); - } - - QSize GLWidget::sizeHint() const - { - return QSize(512, 384); - } - QSize GLWidget::maximumSizeHint() const - { - return QSize(2000, 2000); - } - void GLWidget::initializeGL() - { - int argc=0; - char** test; - glutInit(&argc,test); - QColor trolltechPurple = QColor::fromCmykF(0.39, 0.39, 0.0, 0.0); - - glMatrixMode(GL_PROJECTION); - glLoadIdentity(); - - //gluPerspective(45.0f, (GLfloat)400.0/(GLfloat)400.0,0.1f,100.0f); - //gluPerspective(zoomFactor, (GLfloat)400.0/(GLfloat)400.0,0.1f,100.0f); - - qglClearColor(trolltechPurple.dark()); - - - //loading the texture - LoadBMPFile(&h_Src, &imageW, &imageH, "plane.bmp"); - - glGenTextures(1,&tex); - glBindTexture(GL_TEXTURE_2D, tex); - glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, imageW, imageH, 0, GL_RGBA, GL_UNSIGNED_BYTE, h_Src); - - for(int i=0;idelta()/480.0>0) - zoomFactorX+=event->delta()/480.0; - if(zoomFactorY+event->delta()/480.0>0) - zoomFactorY+=event->delta()/480.0; - updateGL(); - } - - void GLWidget::mousePressEvent(QMouseEvent *event){ - int argc=0; - char** test; - lastPos = event->pos(); - if(event->button() == Qt::LeftButton){ - calculatedGradient = true; - int xpos = (int) ( -dx + event->x()/zoomFactorX + 0.5); - int ypos = (int) ( height() - dy - ( (height()-event->y())/zoomFactorY )+ 0.5); - int startNode = (imageH-ypos)*imageW + xpos; -// int startNode = event->y()*imageW + event->x(); - runTestGraph( argc, test,imageW,imageH,startNode,gradient); - } - updateGL(); - } - void GLWidget::mouseMoveEvent(QMouseEvent *event){ - int mydx = event->x() - lastPos.x(); - int mydy = event->y() - lastPos.y(); - int xpos = (int) ( -dx + event->x()/zoomFactorX + 0.5); - int ypos = (int) ( height() - dy - ( (height()-event->y())/zoomFactorY )+ 0.5); - //printf("x %d y %d dy %d\n",xpos, ypos,dy); - myWindow->setX(xpos); - myWindow->setY(ypos); - emit coordChanged(xpos,ypos); - - mouseX = xpos; - mouseY = ypos; - - if(event->buttons() && Qt::LeftButton){ - //printf("Delta x %f\n",dx); - dx += mydx / zoomFactorX; - dy += mydy / zoomFactorY; - //dx += zoomFactorX*mydx/width(); - //dy += zoomFactorY*mydy/height(); - } - else{ - if(calculatedGradient){ - if( xpos>=0 && xpos < imageW && ypos >=0 && ypos < imageH){ - int node = (imageH-ypos)*imageW + xpos; - int path[5000]; - for(int i=0;i<5000;i++) - path[i]=-1; - - printPathSource(node,path); - glBindTexture(GL_TEXTURE_2D, mask); - glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR); - for(int i=0;ipos(); - updateGL(); - - } - - void GLWidget::keyPressEvent(QKeyEvent* event){ - if(event->key()==Qt::Key_R){ - printf("Ry %f\n",ry); - ry+=10.0f; - } - else if(event->key()==Qt::Key_T){ - printf("Ry %f\n",ry); - ry-=10.0f; - } - else if(event->key()==Qt::Key_O){ - if(chosenTexture==ORIGINALTEX) - chosenTexture = SOBELTEX; - else - chosenTexture = ORIGINALTEX; - } - else if(event->key()==Qt::Key_P){ - drawTeapot = !drawTeapot; - } - else if(event->key()==Qt::Key_W){ - approach -= 0.1f; - } - else if(event->key()==Qt::Key_S){ - approach += 0.1f; - } - else{ - printf("Ignoring\n"); - event->ignore(); - } - updateGL(); - - - } diff --git a/GLWidget.h b/GLWidget.h deleted file mode 100644 index 00a058f..0000000 --- a/GLWidget.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef GLWIDGET_H_ -#define GLWIDGET_H_ - -//#include - -#include - - -#include -#include -#include "Window.h" - -#define ORIGINALTEX 0 -#define SOBELTEX 1 - -//extern void APIENTRY glBlendColor (GLclampf, GLclampf, GLclampf, GLclampf); - -//Isolated definition -typedef struct{ - unsigned char x, y, z, w; -} uchar4; - -//////////////////////////////////////////////////////////////////////////////// -// Small BMP loading utility -//////////////////////////////////////////////////////////////////////////////// -extern "C" void LoadBMPFile(uchar4 **, int *, int *, const char *); - -//CUDA kernel wrapper -void -runTestGraph( int argc, char** argv,int iw, int ih, int startNode, float* gradient); -void printPathSource(int destination,int* path); - -void empty(); - -class GLWidget : public QGLWidget -{ - Q_OBJECT - -public: - GLWidget(QWidget* parent=0); - virtual ~GLWidget(); - QSize minimumSizeHint() const; - QSize maximumSizeHint() const; - QSize sizeHint() const; - -protected: - void initializeGL(); - void paintGL(); - void resizeGL(int width, int height); - void mousePressEvent(QMouseEvent *event); - void mouseMoveEvent(QMouseEvent *event); - void wheelEvent ( QWheelEvent * event ); - void keyPressEvent(QKeyEvent* event); - float zoomFactor,zoomFactorX,zoomFactorY; - float ry; - float dx,dy; - int mouseX, mouseY; - //float wWidth,wHeight; - QPoint lastPos; - Window* myWindow; - GLuint tex, mask, sobel; - uchar4* h_Src; - int imageW, imageH; - float * gradient; - int chosenTexture; - int drawTeapot; - float approach; - - -signals: - void coordChanged(int x,int y); - -private: - bool calculatedGradient; - -}; - - -#endif /*GLWIDGET_H_*/ diff --git a/How to compile b/How to compile deleted file mode 100644 index 5ecfb4c..0000000 --- a/How to compile +++ /dev/null @@ -1,3 +0,0 @@ -make clean -make -nvcc gridclean.cu *.o -I /home/baggio/NVIDIA_CUDA_SDK/common/inc/ -L /home/baggio/NVIDIA_CUDA_SDK/lib/ -lcuda -lcudart -lcutil -lGL -lGLU -arch sm_11 -lQtOpenGL -lQtGui -lQtCore -lGLU -lGL -lpthread -lglut -o GPUWire diff --git a/Master Thesis - Updated February 15th.pdf b/Master Thesis - Updated February 15th.pdf deleted file mode 100644 index 38e137b..0000000 Binary files a/Master Thesis - Updated February 15th.pdf and /dev/null differ diff --git a/ProjectHome.md b/ProjectHome.md new file mode 100644 index 0000000..917a7b7 --- /dev/null +++ b/ProjectHome.md @@ -0,0 +1,21 @@ +This projects hosts the source code for a implementation of the known Livewire algorithm for image segmentation using graphic processing units, through NVidia's CUDA API. + +## Video showing GPU based segmentation running CUDA and Qt on Linux ## + +There's a small video showing one of the results of the project [here](http://www.youtube.com/watch?v=eH_Jojw9NWw) + +## GPGPU BASED IMAGE SEGMENTATION LIVEWIRE ALGORITHM IMPLEMENTATION ## + + +### Master Thesis Abstract ### + +This thesis presents a GPU implementation of the Livewire algorithm. Instead of using traditional architectures, like the CPU, this implementation focus advantages obtained using Single Instruction Multiple Data (SIMD) architectures. The algorithm is divided in three phases: Sobel or Laplacian filter convolution, image modeling as a grid graph and solving the non-negative weighted edges single source shortest path problem. In order to calculate the shortest path, a parallel approach is made through the development of an adapted version of the ∆-stepping algorithm for GPUs. Each part of the algorithm was programmed as a single kernel, which is executed and compiled to the GPU, using CUDA ( Compute Unified Device Architecture), available on NVidia GeForce 8 series. GPUs have been the first SIMD commodity devices widely available on several desktops. Although originally designed for applications highly focused on rendering, GPGPU ( General Purpose Computing on Graphic Processing Units) researchers have shown that the huge processing power available on these devices as well as the recent advent of a programmable pipeline have made of GPUs an attractive option for low cost high performance platforms. Even though the implementation has used CUDA API, several other approaches are pointed out, showing a wide variety of alternatives such as other platforms – multicore CPUs, Cell processor –, other graphic APIs, such as Cg, OpenGL and DirectX, or even +different approaches like RapidMind and Brook, which make GPU access transparent. The conclusion of this thesis highlights a successful implementation of the algorithm using the GPU architecture showing advantages and disadvantages of this approach. A critical result analysis shows that intense speedups are seen in image filtering algorithms. On the other hand, the wide use of dependent device memory look-ups has constrained ∆-stepping +algorithm from achieving higher performance than CPU implementation although a better performance is expected for wider graphs. If device memory access latency was decreased or if more threads were available, a huge increase in performance would be expected. Besides showing the viability of the Livewire algorithm implementation, this thesis makes available an open-source image segmentation GPU based application, which can be used as example for future GPU algorithm implementations at http://code.google.com/p/gpuwire/. + +### Resumo da tese ### + +Esta tese apresenta a implementação do algoritmo de segmentação de imagens Livewire em uma placa de vídeo, que é uma arquitetura Single Instruction Multiple Data(SIMD), ao invés da utilização tradicional da CPU. O algoritmo é dividido em três fases: aplicação do filtro Sobel ou Laplaciano sobre a imagem, seguido de modelagem da mesma através de grafos do tipo grid e posterior resolução do menor caminho a partir de um dado nó. Para tal cálculo uma abordagem paralela feita através do desenvolvimento de uma versão adaptada do algoritmo ∆-stepping para placas de vídeo. Cada uma das partes +do algoritmo foi transformada em um núcleo que é executado e compilado na própria placa de vídeo, utilizando-se a arquitetura CUDA (Compute Unified Device Architecture) disponível na série 8 da NVidia GeForce. As placas de vídeo são os primeiros dispositivos SIMD amplamente disponíveis em diversos computadores. Embora originalmente desenvolvidos para aplicações de renderização, pesquisadores da área de GPGPU (General Purpose Computing on Graphic Processing Units) têm demonstrado que o imenso poder computacional destes dispositivos e a sua recente capacidade de programação fazem deles +uma alternativa atrativa como plataforma de alta-performance. A implementação foi focada na arquitetura CUDA, mas diversas outras abordagens são comentadas e referenciadas mostrando grande parte das alternativas disponíveis, como outras plataformas - CPUs com multicore, processador Cell -, outras APIs gráficas como Cg e OpenGL, ou mesmo abordagens que deixam transparente o uso de GPUs, como RapidMind e Brook. A conclusão coloca em evidência o sucesso da implementação do algoritmo para a plataforma de GPUs ressaltando os aspectos positivos e negativos da abordagem utilizada. Uma análise crítica dos resultados demonstra que o processamento de imagens através de filtros tem grande ganho de desempenho com relação a uma CPU, no entanto, devido a muitos acessos à memória do dispositivo, o algoritmo ∆-stepping não mostrou performance superior em tal arquitetura com os tamanhos de grafos testados, apontando uma maior melhora quanto maior o tamanho do grafo. Uma menor demora no acesso à memória local ou mesmo um maior número de threads poderiam aumentar muito a performance do algoritmo. Além da demonstração de viabilidade de implementação do algoritmo, esta tese +contribui disponibilizando uma aplicação open-source de segmentação de imagens através da GPU (em http://code.google.com/p/gpuwire/), servindo como base para futuras implementações na mesma arquitetura. \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index 6ef0e55..0000000 --- a/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# gpuwire -Automatically exported from code.google.com/p/gpuwire - -Please, check Master Thesis pdf. - -![This is the image description](plane.bmp) diff --git a/SUMMARY.md b/SUMMARY.md deleted file mode 100644 index a5130d2..0000000 --- a/SUMMARY.md +++ /dev/null @@ -1,6 +0,0 @@ -# Summary - -* Intro -* [Setting up](setting_up.md) -* [Copying](copying.md) - diff --git a/Window.cpp b/Window.cpp deleted file mode 100644 index 4fedf99..0000000 --- a/Window.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include "Window.h" -#include "GLWidget.h" - - -Window::Window() -{ - glWidget = new GLWidget(this); - QVBoxLayout *mainLayout = new QVBoxLayout; - mainLayout->addWidget(glWidget); - - - - labelX = new QLabel("X:"); - labelY = new QLabel("Y:"); - - //mainLayout->addWidget(labelX); - //mainLayout->addWidget(labelY); - - coordWidget = new CoordWidget(); - - mainLayout->addWidget(coordWidget); - - setLayout(mainLayout); - setWindowTitle("GPU LiveWire"); - setMinimumSize(QSize(0, 0)); - - QObject::connect(glWidget,SIGNAL(coordChanged(int,int)), coordWidget, SLOT(setCoord(int,int))); -} - -void Window::setX(int x){ - labelX->setText(tr("X: %1").arg(x)); -} -void Window::setY(int y){ - labelY->setText(tr("Y: %1").arg(y)); -} - -Window::~Window() -{ -} diff --git a/Window.h b/Window.h deleted file mode 100644 index bc92447..0000000 --- a/Window.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef WINDOW_H_ -#define WINDOW_H_ - -#include -#include "CoordWidget.h" - -class GLWidget; -class QLabel; - -class Window : public QWidget -{ - Q_OBJECT - -public: - Window(); - virtual ~Window(); - void setX(int x); - void setY(int y); - - -private: - GLWidget *glWidget; - QLabel *labelX; - QLabel *labelY; - CoordWidget *coordWidget; - -}; - -#endif /*WINDOW_H_*/ diff --git a/bmploader.cpp b/bmploader.cpp deleted file mode 100644 index 967fce8..0000000 --- a/bmploader.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 1993-2007 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO USER: - * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code - * in individual and commercial software. - * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must - * include, in the user documentation and internal comments to the code, - * the above Disclaimer and U.S. Government End Users Notice. - */ - - -#include -#include - -#ifdef _WIN32 -# pragma warning( disable : 4996 ) // disable deprecated warning -#endif - -#pragma pack(1) - -typedef struct{ - short type; - int size; - short reserved1; - short reserved2; - int offset; -} BMPHeader; - -typedef struct{ - int size; - int width; - int height; - short planes; - short bitsPerPixel; - unsigned compression; - unsigned imageSize; - int xPelsPerMeter; - int yPelsPerMeter; - int clrUsed; - int clrImportant; -} BMPInfoHeader; - - - -//Isolated definition -typedef struct{ - unsigned char x, y, z, w; -} uchar4; - - - -extern "C" void LoadBMPFile(uchar4 **dst, int *width, int *height, const char *name){ - BMPHeader hdr; - BMPInfoHeader infoHdr; - int x, y; - - FILE *fd; - - - printf("Loading %s...\n", name); - if(sizeof(uchar4) != 4){ - printf("***Bad uchar4 size***\n"); - exit(0); - } - - if( !(fd = fopen(name,"rb")) ){ - printf("***BMP load error: file access denied***\n"); - exit(0); - } - - fread(&hdr, sizeof(hdr), 1, fd); - if(hdr.type != 0x4D42){ - printf("***BMP load error: bad file format***\n"); - exit(0); - } - fread(&infoHdr, sizeof(infoHdr), 1, fd); - - if(infoHdr.bitsPerPixel != 24){ - printf("***BMP load error: invalid color depth***\n"); - exit(0); - } - - if(infoHdr.compression){ - printf("***BMP load error: compressed image***\n"); - exit(0); - } - - *width = infoHdr.width; - *height = infoHdr.height; - *dst = (uchar4 *)malloc(*width * *height * 4); - - printf("BMP width: %u\n", infoHdr.width); - printf("BMP height: %u\n", infoHdr.height); - - fseek(fd, hdr.offset - sizeof(hdr) - sizeof(infoHdr), SEEK_CUR); - - for(y = 0; y < infoHdr.height; y++){ - for(x = 0; x < infoHdr.width; x++){ - (*dst)[(y * infoHdr.width + x)].z = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].y = fgetc(fd); - (*dst)[(y * infoHdr.width + x)].x = fgetc(fd); - } - - for(x = 0; x < (4 - (3 * infoHdr.width) % 4) % 4; x++) - fgetc(fd); - } - - - if(ferror(fd)){ - printf("***Unknown BMP load error.***\n"); - free(*dst); - exit(0); - }else - printf("BMP file loaded successfully!\n"); - - fclose(fd); -} diff --git a/copying.md b/copying.md deleted file mode 100644 index e69de29..0000000 diff --git a/delta_kernel.cu b/delta_kernel.cu deleted file mode 100644 index 934f045..0000000 --- a/delta_kernel.cu +++ /dev/null @@ -1,561 +0,0 @@ -#ifndef _DELTA_KERNEL_H_ -#define _DELTA_KERNEL_H_ - -#include -#define DELTA 1.0 -#define INF 1e20 -#define BUCKETSIZE 4096*8*2 -#define NUMBUCKETS 512 - -#define DOWN 0 -#define UP 1 -#define RIGHT 2 -#define LEFT 3 - - -//#define EMULATION - -#ifdef EMULATION -#define DEBUG(x...) printf(x) -#else -#define DEBUG(x...) -#endif - -texture mytex0; -texture mytex1; -texture mytex2; -texture mytex3; - -__device__ void -demptyKernel(){ -} - -__global__ void -labelKernel (int i, int* B,int* BCount,int* BPos, int tw,int th, int* RLoc,int* R, float* dR,int* dSourceR, float* d,int* vBucketMap){ -//ver se não precisa colocar 4 BiCount em algum lugar... - - //todo: try to increase speed using shared memory for RLoc... think more about it (maybe RLoc is too big for shared memory) - const unsigned int tid = threadIdx.x; - const unsigned int num_threads = blockDim.x; - - int BiCount = BPos[i]; - int node, row, col,index; - float cost,f1,f2,fmin; - - //cleaning R - //4 times because each node can be reached from up, down, left and right directions (and more 4 times because for each node 4 more are open) -// DEBUG("BiCount %d lastpos %d\n",BiCount,4*(BiCount-1)+3); - for(int k=0; (num_threads*k + tid) < 16*BiCount;k++){ - R[(num_threads*k + tid) ]=-1; - dR[(num_threads*k + tid) ]=INF; - dSourceR[(num_threads*k + tid) ]=-1; - } - - - __syncthreads(); - const int dx[4]={0,0,1,-1}; - const int dy[4]={1,-1,0,0}; - - - for(int k=0; num_threads*k + tid < BiCount;k++){ - node = B [ BUCKETSIZE*i + num_threads*k + tid]; - if(node!=-1){ - - DEBUG("(tid %d) node %d(from B[%d] pos %d)\n",tid,node,i,num_threads*k + tid); - for(int j=0;j<4;j++){ - switch(j){ - case 0: - cost = tex2D(mytex0,node%tw,node/tw); - break; - case 1: - cost = tex2D(mytex1,node%tw,node/tw); - break; - case 2: - cost = tex2D(mytex2,node%tw,node/tw); - break; - case 3: - cost = tex2D(mytex3,node%tw,node/tw); - break; - } - row = node/tw + dy[j]; - col = node%tw + dx[j]; - - if( (row>=0) && (row < th) && (col >= 0) && (col < tw) ){ - RLoc [ row*tw + col ] = 4*(num_threads*k + tid)+j; - DEBUG("(tid %d)Connecting node %d to be processed by %d\n",tid,row*tw+col,RLoc [ row*tw + col ]); - } - - } - - } - } - - __syncthreads(); - - //copy Edges to R - for(int k=0; num_threads*k + tid < BiCount;k++){ - node = B [ BUCKETSIZE*i + num_threads*k + tid]; - if(node!=-1){ - for(int j=0;j<4;j++){ - switch(j){ - case 0: - cost = tex2D(mytex0,node%tw,node/tw); - break; - case 1: - cost = tex2D(mytex1,node%tw,node/tw); - break; - case 2: - cost = tex2D(mytex2,node%tw,node/tw); - break; - case 3: - cost = tex2D(mytex3,node%tw,node/tw); - break; - } - - - - row = node/tw + dy[j]; - col = node%tw + dx[j]; - DEBUG("Pre-candidate in R %d (d=%f)\n",row*tw + col,d[node]+cost,dR [4*RLoc[row*tw + col]+j]); - if( (row>=0) && (row < th) && (col >= 0) && (col < tw) ){ - DEBUG("Candidate in R %d (d=%f)\n",row*tw + col,d[node]+cost,dR [4*RLoc[row*tw + col]+j]); - if((cost<=DELTA)&&( d[node]+cost < d[row*tw + col])){ - R [4*RLoc[row*tw + col]+j] = row*tw + col; - dR[4*RLoc[row*tw + col]+j] = d[node]+cost; - dSourceR[4*RLoc[row*tw + col]+j]=node; - DEBUG("New node in R %d (d=%f,e=%f) in pos %d\n",R [4*RLoc[row*tw + col]+j],dR [4*RLoc[row*tw + col]+j],cost,4*RLoc[row*tw + col]+j); - vBucketMap[node]=-1; - } - } - } - } - } - __syncthreads(); - - //gathering data to find the minimum cost way to get to node n - //TODO: OPTIMIZE IN SUCH A WAY IT WON'T BE NEEDED TO GO THROUGH THE 4 EDGES, since they store the same value - int smin ; - float dists[4]; - for(int k=0; (num_threads*k + tid) < 4*BiCount;k++){ - index = 4*(num_threads*k + tid); - dists[0] = dR[ index ]; - dists[1] = dR[ index+1]; - dists[2] = dR[ index+2]; - dists[3] = dR[ index+3]; - - //finds the node with minimum distance, so that the path can be stored - if( dists[0] < dists[1]){ - if( dists[2] < dists[3]){ - if( dists[0] < dists[2]){ - smin = dSourceR[index]; - } - else{ - smin = dSourceR[index+2]; - } - } - else{ - if( dists[0] < dists[3]){ - smin = dSourceR[index]; - } - else{ - smin = dSourceR[index+3]; - } - } - } - else{ - if( dists[2] < dists[3]){ - if( dists[1] < dists[2]){ - smin = dSourceR[index+1]; - } - else{ - smin = dSourceR[index+2]; - } - } - else{ - if( dists[1] < dists[3]){ - smin = dSourceR[index+1]; - } - else{ - smin = dSourceR[index+3]; - } - } - - } - - f1 = fminf( dists[0] , dists[1] ); - f2 = fminf( dists[2] , dists[3] ); - - fmin = fminf(f1,f2); - dR[index ]=fmin; - dR[index+1]=fmin; - dR[index+2]=fmin; - dR[index+3]=fmin; - - DEBUG("Smin %d\n",smin); - dSourceR[index ]=smin; - dSourceR[index+1]=smin; - dSourceR[index+2]=smin; - dSourceR[index+3]=smin; - - - } - __syncthreads(); - -} - -//Pensar se o fato de S ter duplicatas pode impactar em algo -__global__ void -copyB2SKernel(int i, int* B, int* BCount,int* BPos, int* S, int* SCount){ - //TODO: optimize this code - //there's an optimized way of doing this, which is by only - //storing SCount = Scount+ BCount, as output - //and controlling with local variables thread positions - - const unsigned int tid = threadIdx.x; - const unsigned int num_threads = blockDim.x; - - int pos; - - int BiCount = BCount[i]; - for(int k=0; num_threads*k + tid < BiCount;k++){ - if(B[i*BUCKETSIZE+num_threads*k+tid]!=-1){ - pos = atomicAdd(&SCount[0],1); - S[pos] = B[i*BUCKETSIZE+num_threads*k+tid]; - } - } - __syncthreads(); - BCount[i]=0; - BPos[i]=0; - __syncthreads(); -// DEBUG("(tid %d) SCount %d\n",tid,SCount[0]); - - -} - - -//Parallel relax edges -__global__ void -relaxKernelPath( int RCount, int* B,int* BCount,int* BPos, int* RLoc,int* R,float* dR,float* d, int* vBucketLoc, int* vBucketMap, float* deb, int* dSourceR, int* dSource){ - - const unsigned int tid = threadIdx.x; - const unsigned int num_threads = blockDim.x; - int v,bn,bn_old, index; - float x; - - - //remove node from old bucket - - - - for(int k=0; num_threads*k + tid < RCount;k++){ - index = num_threads*k + tid; - - if(R[index]!=-1){ - - x = dR[index]; - v = R[index]; - - if(x=0) && (row < th) && (col >= 0) && (col < tw) ){ - RLoc [ row*tw + col ] = 4*(num_threads*k + tid)+j; - DEBUG("(tid %d)Connecting node %d to be processed by %d\n",tid,row*tw+col,RLoc [ row*tw + col ]); - } - - } - - } - } - - __syncthreads(); - - //copy Edges to R - for(int k=0; num_threads*k + tid < BiCount;k++){ - node = B [ BUCKETSIZE*i + num_threads*k + tid]; - if(node!=-1){ - for(int j=0;j<4;j++){ - switch(j){ - case 0: - cost = tex2D(mytex0,node%tw,node/tw); - break; - case 1: - cost = tex2D(mytex1,node%tw,node/tw); - break; - case 2: - cost = tex2D(mytex2,node%tw,node/tw); - break; - case 3: - cost = tex2D(mytex3,node%tw,node/tw); - break; - } - - - - row = node/tw + dy[j]; - col = node%tw + dx[j]; - DEBUG("Pre-candidate in R %d (d=%f)\n",row*tw + col,d[node]+cost,dR [4*RLoc[row*tw + col]+j]); - if( (row>=0) && (row < th) && (col >= 0) && (col < tw) ){ - DEBUG("Candidate in R %d (d=%f)\n",row*tw + col,d[node]+cost,dR [4*RLoc[row*tw + col]+j]); - if((cost>DELTA)&&( d[node]+cost < d[row*tw + col])){ - R [4*RLoc[row*tw + col]+j] = row*tw + col; - dR[4*RLoc[row*tw + col]+j] = d[node]+cost; - DEBUG("New node in R %d (d=%f,e=%f) in pos %d\n",R [4*RLoc[row*tw + col]+j],dR [4*RLoc[row*tw + col]+j],cost,4*RLoc[row*tw + col]+j); - vBucketMap[node]=-1; - } - } - } - } - } - __syncthreads(); - - //gathering data to find the minimum cost way to get to node n - //TODO: OPTIMIZE IN SUCH A WAY IT WON'T BE NEEDED TO GO THROUGH THE 4 EDGES, since they store the same value - for(int k=0; (num_threads*k + tid) < 16*BiCount;k++){ - f1 = fminf( dR[4*(num_threads*k + tid) ], dR[4*(num_threads*k + tid)+1] ); - f2 = fminf( dR[4*(num_threads*k + tid)+2], dR[4*(num_threads*k + tid)+3] ); - fmin = fminf(f1,f2); - dR[4*(num_threads*k + tid) ]=fmin; - dR[4*(num_threads*k + tid)+1]=fmin; - dR[4*(num_threads*k + tid)+2]=fmin; - dR[4*(num_threads*k + tid)+3]=fmin; - } - __syncthreads(); - -} - - - -__global__ void -emptyKernel(){ -} -// B is the bucket i vector -// RLoc[n] stores the position of node n in R (so that if more than one attempt to update -// the distance to node n is made at the same time, it can be shifted to 0,1,2 or 3 in the position of R) - - -#endif // #ifndef _MEMORY_KERNEL_H_ diff --git a/gridclean.cu b/gridclean.cu deleted file mode 100644 index db78235..0000000 --- a/gridclean.cu +++ /dev/null @@ -1,606 +0,0 @@ -//compile with -//nvcc delta.cu -I /home/baggio/NVIDIA_CUDA_SDK/common/inc/ -L /home/baggio/NVIDIA_CUDA_SDK/lib/ -lcuda -lcudart -lcutil -lGL -lGLU -// includes, system -#include -#include -#include -#include -#include - -// includes, project -#include -#include -//#define TIMER -#define FINETUNING -// includes, kernels -#include -#define NUMTHREADS 128 - - - - - -float* pixels = NULL; -float* dEdges[4]; -float maxDistance = 0.0f; -int* hSource = NULL; - -int GN = 262144; -int maxRCount = 0; - -cudaArray* array[4]; - - -struct edge{ - int dNode[4]; - float weight[4]; -}; - -edge* nodes; - - - - - -void -runTest( int argc, char** argv,int iw, int ih, int startNode); - -void -runTestGraph ( int argc, char** argv, int iw, int ih, int startNode, float* gradient); - - -void printPath(int* source, int destination){ - printf("%d <- %d\n",destination,source[destination]); - if(source[destination]==destination) return; - printPath( source, source[destination]); - -} - -void printPathSource(int destination, int* path){ - path[0] = hSource[destination]; - printf("%d <- %d\n",destination,hSource[destination]); - if(hSource[destination]==destination) return; - printPathSource( hSource[destination],path+1); -} - -void loadTexture(int iw, int ih, float* data, cudaArray* cArray, texture* myTex){ - cudaChannelFormatDesc desc = cudaCreateChannelDesc(); - CUDA_SAFE_CALL(cudaMallocArray(&cArray, &desc, iw, ih)); - CUDA_SAFE_CALL(cudaMemcpyToArray(cArray, 0, 0, data, sizeof(float)*iw*ih, cudaMemcpyHostToDevice)); - // Bind the array to the texture - cudaBindTextureToArray( *myTex, cArray, desc); - -} - -void loadGraphEdges(int iw, int ih,char* myFile){ - int n; - FILE* in = fopen(myFile,"r"); - fscanf(in,"%d\n",&n); - - for(int i=0;i<4;i++){ - dEdges[i] = (float*) malloc(iw*ih*sizeof(float)); - for(int j=0;jmaxDistance) maxDistance=hDist[i]; - } - else - printf("INFINI ",hDist[i]); - if(i%iw==iw-1) printf("\n"); - } - return; -} - - - -void -runTest( int argc, char** argv, int iw, int ih, int startNode) -{ - - - - //initialize the device - cudaSetDevice(0); - - GN = iw*ih; - - - unsigned int num_threads = NUMTHREADS; - - // setup execution parameters - dim3 grid( 1, 1, 1); - dim3 threads( num_threads, 1, 1); - - nodes = (edge*) malloc(GN*sizeof(edge)); - - //loadGraphEdges(iw,ih,argv[1]); - - - float* dDist; - cudaMalloc( (void**) &dDist, GN*sizeof(float)); - float* hDist = (float*) malloc(GN*sizeof(float)); - - for(int i=0;imaxRCount) maxRCount = RCount[0]; - printf("RCount %d\n",RCount[0]); -// if(RCount[0]==0) i++; - sameCount++; - -#ifdef TIMER - cutStartTimer( laptimer); -#endif - -#ifdef FINETUNING - cutStartTimer( labelTimer); -#endif - - labelKernel <<>> ( i, dB, BCount,dBPos, iw,ih, dRLoc, dR, dDistR, dSourceR, dDist,dBucketMap); - cudaThreadSynchronize(); - - - -#ifdef FINETUNING - cutStopTimer( labelTimer); - labelCounter++; -#endif - - cudaMemcpy( &hBPos[i], &dBPos[i], 1*sizeof(int), cudaMemcpyDeviceToHost) ; - - -#ifdef TIMER - ktime = cutGetTimerValue( laptimer ); - printf("Label kernel %f\n",ktime); - cutStartTimer( laptimer); -#endif - -#ifdef FINETUNING - cutStartTimer( copyTimer); -#endif - - - - copyB2SKernel <<>> ( i, dB, BCount,dBPos, dS, dSCount); - cudaThreadSynchronize(); - -#ifdef FINETUNING - cutStopTimer( copyTimer); - copyCounter++; -#endif - - - -#ifdef TIMER - ktime = cutGetTimerValue( laptimer ); - printf("CopyB2S kernel %f\n",ktime); -#endif - -#ifdef TIMER - cutStartTimer( laptimer); -#endif - - -#ifdef FINETUNING - cutStartTimer( emptyTimer); -#endif - - -// emptyKernel <<>> (); -// cudaThreadSynchronize(); - -#ifdef FINETUNING - cutStopTimer( emptyTimer); - emptyCounter++; -#endif - - -#ifdef TIMER - ktime = cutGetTimerValue ( laptimer ); - printf(" Empty kernel %f\n",ktime); -#endif - - //todo: correct RCount - cutStartTimer( laptimer); - - -// printf("Sending RCount %d\n",4*4*hBPos[i]); - -#ifdef FINETUNING - cutStartTimer( relaxTimer); -#endif - - relaxKernelPath <<>> ( 4*4*hBPos[i], dB, BCount, dBPos, dRLoc, dR, dDistR, dDist,dBucketPos, dBucketMap,lido, dSourceR, dSource); - cudaThreadSynchronize(); - -#ifdef FINETUNING - cutStopTimer( relaxTimer); - relaxCounter++; -#endif - - -#ifdef TIMER - ktime = cutGetTimerValue( laptimer ); - printf("Relaxing kernel %f\n",ktime); -#endif - - - cudaMemcpy( RCount, &BCount[i], 1*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hSCount, dSCount, 1*sizeof(int), cudaMemcpyDeviceToHost) ; - - -#ifdef TIMER - printf("After RCount %d (i=%d) | SCount %d\n",RCount[0],i,hSCount[0]); -#endif - - - } - -// ktime = cutGetTimerValue( laptimer ); -#ifdef TIMER - printf("Same called %d times.\n",sameCount); -#endif - -// printf("Label kernel %f\n",ktime); -// printf("Before heavy label%d\n",i); - - -// cutStartTimer( laptimerrunTestGraph); - -// labelHeavyKernel <<>> ( 0, dS, hSCount[0], iw,ih, dRLoc, dR, dDistR, dDist,dBucketMap); -// cudaThreadSynchronize(); - - - - ktime = cutGetTimerValue( laptimer ); -#ifdef TIMER - printf("Labelling heavy kernel %f\n",ktime); -#endif - -// printf("After heavy label%d\n",i); - //todo: correct SCount - -#ifdef TIMER - cutStartTimer( laptimer); -#endif -// int temp[1]; -// cudaMemcpy( temp, &dSCount[0], 1*sizeof(int), cudaMemcpyDeviceToHost) ; -// relaxKernel <<>> ( temp[0], dB, BCount, dRLoc, dR, dDistR, dDist,dBucketPos, dBucketMap,lido); - -// relaxKernel <<>> ( 4*4*hSCount[0], dB, BCount, dBPos, dRLoc, dR, dDistR, dDist,dBucketPos, dBucketMap,lido); - -// ktime = cutGetTimerValue( laptimer ); -#ifdef TIMER - printf("Relaxing heavy kernel %f\n",ktime); -#endif - -// printf("After heavy relax%d\n",i); - -// printf("Done here. i = %d\n",i); - - - } - - - // check if kernel execution generated and error - - CUT_CHECK_ERROR("Kernel execution failed"); - - - - ktime = cutGetTimerValue( nvtimer ); - cudaMemcpy( hRLoc, dRLoc, 1024*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hRLoc, dRLoc, 1024*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hR, dR, 4*1024*8*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hDistR, dDistR, 4*1024*8*sizeof(float), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hS, dS, 1024*8*sizeof(float), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hDist, dDist, GN*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hBCount, BCount, NUMBUCKETS*sizeof(int), cudaMemcpyDeviceToHost) ; - cudaMemcpy( hB, dB, 2*NUMBUCKETS*sizeof(int), cudaMemcpyDeviceToHost) ; - - cudaMemcpy( Hlido, lido, GN*sizeof(float),cudaMemcpyDeviceToHost); - cudaMemcpy( hR, dR, GN*sizeof(int),cudaMemcpyDeviceToHost); - - cudaMemcpy( hSource, dSource, GN*sizeof(int), cudaMemcpyDeviceToHost); - - - - printDistances(hDist,iw,ih); - - printPath(hSource, 10); - -#ifdef FINETUNING - printf("Label averageTime: %f ms (called %d times)\n",cutGetAverageTimerValue(labelTimer), labelCounter); - printf("Copy averageTime: %f ms (called %d times)\n",cutGetAverageTimerValue(copyTimer), copyCounter); - printf("Relax averageTime: %f ms (called %d times)\n",cutGetAverageTimerValue(relaxTimer), relaxCounter); - printf("Empty averageTime: %f ms (called %d times)\n",cutGetAverageTimerValue(emptyTimer), emptyCounter); - printf("Num threads: %d\n",num_threads); - printf("Max RCount %d\n",maxRCount); - printf("Max Distance %f\n", maxDistance); -#endif - - for(int i=0;i<5;i++){ - printf("source[%d]= %d\n",i,hSource[i]); - } - for(int i=512;i<517;i++){ - printf("source[%d]= %d\n",i,hSource[i]); - } - - printf("It took: %f ms\n", ktime); - - cudaUnbindTexture(mytex0); - cudaUnbindTexture(mytex1); - cudaUnbindTexture(mytex2); - cudaUnbindTexture(mytex3); - - cudaFreeArray(array[0]); - cudaFree(dB); - cudaFree(dRLoc); - cudaFree(dDist); - cudaFree(dBucketMap); - cudaFree(dBucketPos); - - cudaFree(BCount); - - cudaFree(dR); - cudaFree(dDistR); - cudaFree(dSource); - cudaFree(dS); - cudaFree(dSCount); - free(dEdges[0]); - free(dEdges[1]); - free(dEdges[2]); - free(dEdges[3]); - free(nodes); - - -} - -void empty(){ - dim3 grid( 1, 1, 1); - dim3 threads( 256, 1, 1); - emptyKernel <<>> (); -} - -void loadEdgesFromGradient(int iw, int ih, float* gradient){ - - for(int i=0;i<4;i++){ - dEdges[i] = (float*) malloc(iw*ih*sizeof(float)); - for(int j=0;j -#include -#include "Window.h" - -void -runTest( int argc, char** argv,int iw, int ih, int startNode); - -void -runTestGraph( int argc, char** argv,int iw, int ih, int startNode, float* gradient); - -void empty(); - -int main(int argc, char* argv[]){ - //only to initialize the environment - empty(); - - /*float* dummy; - if(argc <2){ - printf("Usage: program_name graph_file graph_width graph_height start_node\n"); - return 0; - }*/ - //printf( default iw and ih is 512 -/* if(argc == 2){ - runTest( argc, argv,512,512,0); - } - else{ - int iw, ih; - sscanf(argv[2],"%d",&iw); - sscanf(argv[3],"%d",&ih); - int startNode = 0; - if(argc >= 5) - sscanf(argv[4],"%d",&startNode); - //runTestGraph( argc, argv,iw,ih,startNode,dummy); - }*/ - - QApplication app(argc,argv); - Window window; - window.show(); - - return app.exec(); -} diff --git a/personal/Kathy.ppt b/personal/Kathy.ppt deleted file mode 100644 index c81d05e..0000000 Binary files a/personal/Kathy.ppt and /dev/null differ diff --git a/plane.bmp b/plane.bmp deleted file mode 100644 index daea037..0000000 Binary files a/plane.bmp and /dev/null differ diff --git a/qt.pro b/qt.pro deleted file mode 100644 index 2215df1..0000000 --- a/qt.pro +++ /dev/null @@ -1,15 +0,0 @@ -###################################################################### -# Automatically generated by qmake (2.01a) Sat Nov 17 11:02:08 2007 -###################################################################### - -TEMPLATE = app -TARGET = -DEPENDPATH += . -INCLUDEPATH += . -QT+=opengl - - - -# Input -HEADERS += GLWidget.h Window.h CoordWidget.h -SOURCES += GLWidget.cpp main.cpp Window.cpp bmploader.cpp CoordWidget.cpp diff --git a/setting_up.md b/setting_up.md deleted file mode 100644 index bb242e1..0000000 --- a/setting_up.md +++ /dev/null @@ -1,7 +0,0 @@ -This is how to set up -everything - - -.![](http://www.jrsoftware.org/images/is-welcome.png) - -![](main.cpp) \ No newline at end of file