diff --git a/perflab/matrix/Makefile b/perflab/matrix/Makefile new file mode 100644 index 0000000..2dc672d --- /dev/null +++ b/perflab/matrix/Makefile @@ -0,0 +1,34 @@ +CC = gcc +CFLAGS = -Wall -O1 -g +#LDFLAGS = -lm -lcudart -lcuda + +# Source files +SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c +#CUDA_SRCS = rowcol.cu +OBJS = $(SRCS:.c=.o) +#rowcol.o + +# Target executable +TARGET = matrix_test + +# Default target +all: $(TARGET) + +# Rule to build the executable +$(TARGET): $(OBJS) + $(CC) $(OBJS) -o $(TARGET) $(LDFLAGS) + +# Rule to build object files +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +# Rule to build CUDA object files +#rowcol.o: rowcol.cu +# $(NVCC) $(CUDA_FLAGS) -c $< -o $@ + +# Clean rule +clean: + rm -f $(OBJS) $(TARGET) + +# Phony targets +.PHONY: all clean diff --git a/perflab/matrix/clock.c b/perflab/matrix/clock.c index a587590..b826af4 100644 --- a/perflab/matrix/clock.c +++ b/perflab/matrix/clock.c @@ -1,229 +1,196 @@ -/* clock.c - * Retrofitted to use thread-specific timers - * and to get clock information from /proc/cpuinfo - * (C) R. E. Bryant, 2010 - * - */ - -/* When this constant is not defined, uses time stamp counter */ -#define USE_POSIX 0 - -/* Choice to use cpu_gettime call or Intel time stamp counter directly */ - -#include -#include -#include -#include -//#include -#include -#include -#include "clock.h" - -/* Use x86 cycle counter */ - -/* Initialize the cycle counter */ -static unsigned cyc_hi = 0; -static unsigned cyc_lo = 0; - -/* Set *hi and *lo to the high and low order bits of the cycle counter. - Implementation requires assembly code to use the rdtsc instruction. */ -void access_counter(unsigned *hi, unsigned *lo) -{ - - long long counter; - - counter = __rdtsc(); - (*hi) = (unsigned int)(counter >> 32); - (*lo) = (unsigned int)counter; -/* - - LARGE_INTEGER lPerformanceCount; - - QueryPerformanceCounter(&lPerformanceCount); - (*hi) = (unsigned int)lPerformanceCount.HighPart; - (*lo) = (unsigned int)lPerformanceCount.LowPart; -// printf("%08X %08X\n",(*hi),(*lo)); -*/ -} - - -/* Record the current value of the cycle counter. */ -void start_counter() -{ - access_counter(&cyc_hi, &cyc_lo); -} - -/* Return the number of cycles since the last call to start_counter. */ -double get_counter() -{ - unsigned ncyc_hi, ncyc_lo; - unsigned hi, lo, borrow; - double result; - - /* Get cycle counter */ - access_counter(&ncyc_hi, &ncyc_lo); - - /* Do double precision subtraction */ - lo = ncyc_lo - cyc_lo; - borrow = cyc_lo > ncyc_lo; - hi = ncyc_hi - cyc_hi - borrow; - result = (double) hi * (1 << 30) * 4 + lo; - return result; -} -void make_CPU_busy(void) -{ - volatile double old_tick,new_tick; - start_counter(); - old_tick = get_counter(); - new_tick = get_counter(); - while (new_tick - old_tick < 1000000000) - new_tick = get_counter(); -} - -//CPU的频率 -double mhz(int verbose) -{ - LARGE_INTEGER lFrequency; - LARGE_INTEGER lPerformanceCount_Start; - LARGE_INTEGER lPerformanceCount_End; - double mhz; - double fTime; - __int64 _i64StartCpuCounter; - __int64 _i64EndCpuCounter; - //On a multiprocessor machine, it should not matter which processor is called. - //However, you can get different results on different processors due to bugs in - //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. - HANDLE hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - - //主板上高精度定时器的晶振频率 - //这个定时器应该就是一片8253或者8254 - //在intel ich7中集成了8254 - QueryPerformanceFrequency(&lFrequency); -// if (verbose>0) -// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart); - - //这个定时器每经过一个时钟周期,其计数器会+1 - QueryPerformanceCounter(&lPerformanceCount_Start); - - //RDTSC指令:获取CPU经历的时钟周期数 - _i64StartCpuCounter=__rdtsc(); - - //延时长一点,误差会小一点 - //int nTemp=100000; - //while (--nTemp); - Sleep(200); - - QueryPerformanceCounter(&lPerformanceCount_End); - - _i64EndCpuCounter=__rdtsc(); - - //f=1/T => f=计数次数/(计数次数*T) - //这里的“计数次数*T”就是时间差 - fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) - /(double)lFrequency.QuadPart; - - mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); - if (verbose>0) - printf("CPU频率为:%1.6fMHz.\n",mhz); - return mhz; -} - -double CPU_Factor1(void) -{ - double result; - int i,j,k,ii,jj,kk; - LARGE_INTEGER lStart,lEnd; - LARGE_INTEGER lFrequency; - HANDLE hThread; - double fTime; - - QueryPerformanceFrequency(&lFrequency); - - ii = 43273; - kk = 1238; - result = 1; - jj = 1244; - - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - QueryPerformanceCounter(&lStart); - //_asm("cpuid"); - start_counter(); - for (i=0;i<100;i++) - for (j=0;j<1000;j++) - for (k=0;k<1000;k++) - kk += kk*ii+jj; - - result = get_counter(); - QueryPerformanceCounter(&lEnd); - fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); - printf("CPU运行时间为%f",result); - printf("\t %f\n",fTime); - return result; -} - -double CPU_Factor(void) -{ - double frequency; - double multiplier = 1000 * 1000 * 1000;//nano - LARGE_INTEGER lFrequency; - LARGE_INTEGER start,stop; - HANDLE hThread; - int i; - const int gigahertz= 1000*1000*1000; - const int known_instructions_per_loop = 27317; - - int iterations = 100000000; - int g = 0; - double normal_ticks_per_second; -double ticks; -double time; -double loops_per_sec; -double instructions_per_loop; -double ratio; -double actual_freq; - - QueryPerformanceFrequency(&lFrequency); - frequency = (double)lFrequency.QuadPart; - - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - QueryPerformanceCounter(&start); - for( i = 0; i < iterations; i++) - { - g++; - g++; - g++; - g++; - } - QueryPerformanceCounter(&stop); - - //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 - normal_ticks_per_second = frequency * 1000; - ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); - time = (ticks * multiplier) /frequency; - loops_per_sec = iterations / (time/multiplier); - instructions_per_loop = normal_ticks_per_second / loops_per_sec; - - ratio = (instructions_per_loop / known_instructions_per_loop); - actual_freq = normal_ticks_per_second / ratio; -/* - actual_freq = normal_ticks_per_second / ratio; - actual_freq = known_instructions_per_loop*iterations*multiplier/time; - - 2293 = x/time; - - 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 - loops_per_sec = iterations*frequency / ticks - - instructions_per_loop = / loops_per_sec; -*/ - printf("Perf counter freq: %f\n", normal_ticks_per_second); - printf("Loops per sec: %f\n", loops_per_sec); - printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); - printf("Presumed freq: %f\n", actual_freq); - printf("ratio: %f\n", ratio); - printf("time=%f\n",time); - return ratio; -} +/* clock.c + * Retrofitted to use thread-specific timers + * and to get clock information from /proc/cpuinfo + * (C) R. E. Bryant, 2010 + * Modified for cross-platform compatibility + */ + +#define _GNU_SOURCE // For sched_setaffinity on Linux +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#else +#include +#include +#include +#include +typedef struct { + uint64_t QuadPart; +} LARGE_INTEGER; +typedef void *HANDLE; +#define __int64 long long +#define Sleep(ms) usleep((ms) * 1000) +#endif + +#include "clock.h" + +/* Use x86 cycle counter */ +static unsigned cyc_hi = 0; +static unsigned cyc_lo = 0; + +void access_counter(unsigned *hi, unsigned *lo) { + uint64_t counter = __rdtsc(); + *hi = (unsigned)(counter >> 32); + *lo = (unsigned)counter; +} + +void start_counter() { access_counter(&cyc_hi, &cyc_lo); } + +double get_counter() { + unsigned ncyc_hi, ncyc_lo; + access_counter(&ncyc_hi, &ncyc_lo); + uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo; + uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo; + return (double)(end - start); +} + +void make_CPU_busy(void) { + volatile double old_tick = get_counter(); + volatile double new_tick; + while ((new_tick - old_tick) < 1000000000) { + new_tick = get_counter(); + } +} + +#ifdef _WIN32 +#define GET_TIME(dest) QueryPerformanceCounter(dest) +#else +static inline void GET_TIME(LARGE_INTEGER *dest) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + dest->QuadPart = (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec; +} +#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000) +#endif + +double mhz(int verbose) { + LARGE_INTEGER lFrequency; + LARGE_INTEGER lPerformanceCount_Start; + LARGE_INTEGER lPerformanceCount_End; + double mhz; + double fTime; + __int64 _i64StartCpuCounter; + __int64 _i64EndCpuCounter; + +#ifdef _WIN32 + HANDLE hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +#endif + + QueryPerformanceFrequency(&lFrequency); + GET_TIME(&lPerformanceCount_Start); + _i64StartCpuCounter = __rdtsc(); + Sleep(200); + GET_TIME(&lPerformanceCount_End); + _i64EndCpuCounter = __rdtsc(); + + fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) / + (double)lFrequency.QuadPart; + mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0); + + if (verbose > 0) { + printf("CPU棰戠巼涓: %.6fMHz.\n", mhz); + } + return mhz; +} + +double CPU_Factor1(void) { + double result; + int i, j, k; + LARGE_INTEGER lStart, lEnd; + LARGE_INTEGER lFrequency; + double fTime; + +#ifdef _WIN32 + HANDLE hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +#endif + + QueryPerformanceFrequency(&lFrequency); + GET_TIME(&lStart); + start_counter(); + + for (i = 0; i < 100; i++) + for (j = 0; j < 1000; j++) + for (k = 0; k < 1000; k++) + ; + + result = get_counter(); + GET_TIME(&lEnd); + + fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart; + printf("CPU璁$畻鏃堕暱涓: %f", result); + printf("\t %f\n", fTime); + return result; +} + +double CPU_Factor(void) { + double frequency; + double multiplier = 1000 * 1000 * 1000; // nano + LARGE_INTEGER lFrequency; + LARGE_INTEGER start, stop; + int i; + const int known_instructions_per_loop = 27317; + int iterations = 100000000; + int g = 0; + double normal_ticks_per_second; + double ticks; + double time; + double loops_per_sec; + double instructions_per_loop; + double ratio; + double actual_freq; + +#ifdef _WIN32 + HANDLE hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); +#endif + + QueryPerformanceFrequency(&lFrequency); + frequency = (double)lFrequency.QuadPart; + GET_TIME(&start); + + for (i = 0; i < iterations; i++) { + g++; + g++; + g++; + g++; + } + + GET_TIME(&stop); + normal_ticks_per_second = frequency * 1000; + ticks = (double)(stop.QuadPart - start.QuadPart); + time = (ticks * multiplier) / frequency; + loops_per_sec = iterations / (time / multiplier); + instructions_per_loop = normal_ticks_per_second / loops_per_sec; + ratio = instructions_per_loop / known_instructions_per_loop; + actual_freq = normal_ticks_per_second / ratio; + + printf("Perf counter freq: %f\n", normal_ticks_per_second); + printf("Loops per sec: %f\n", loops_per_sec); + printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); + printf("Presumed freq: %f\n", actual_freq); + printf("ratio: %f\n", ratio); + printf("time=%f\n", time); + return ratio; +} diff --git a/perflab/matrix/clock.c.bak b/perflab/matrix/clock.c.bak new file mode 100644 index 0000000..3b2a198 --- /dev/null +++ b/perflab/matrix/clock.c.bak @@ -0,0 +1,229 @@ +/* clock.c + * Retrofitted to use thread-specific timers + * and to get clock information from /proc/cpuinfo + * (C) R. E. Bryant, 2010 + * + */ + +/* When this constant is not defined, uses time stamp counter */ +#define USE_POSIX 0 + +/* Choice to use cpu_gettime call or Intel time stamp counter directly */ + +#include +#include +#include +#include +//#include +//#include +#include +#include "clock.h" + +/* Use x86 cycle counter */ + +/* Initialize the cycle counter */ +static unsigned cyc_hi = 0; +static unsigned cyc_lo = 0; + +/* Set *hi and *lo to the high and low order bits of the cycle counter. + Implementation requires assembly code to use the rdtsc instruction. */ +void access_counter(unsigned *hi, unsigned *lo) +{ + + long long counter; + + counter = __rdtsc(); + (*hi) = (unsigned int)(counter >> 32); + (*lo) = (unsigned int)counter; +/* + + LARGE_INTEGER lPerformanceCount; + + QueryPerformanceCounter(&lPerformanceCount); + (*hi) = (unsigned int)lPerformanceCount.HighPart; + (*lo) = (unsigned int)lPerformanceCount.LowPart; +// printf("%08X %08X\n",(*hi),(*lo)); +*/ +} + + +/* Record the current value of the cycle counter. */ +void start_counter() +{ + access_counter(&cyc_hi, &cyc_lo); +} + +/* Return the number of cycles since the last call to start_counter. */ +double get_counter() +{ + unsigned ncyc_hi, ncyc_lo; + unsigned hi, lo, borrow; + double result; + + /* Get cycle counter */ + access_counter(&ncyc_hi, &ncyc_lo); + + /* Do double precision subtraction */ + lo = ncyc_lo - cyc_lo; + borrow = cyc_lo > ncyc_lo; + hi = ncyc_hi - cyc_hi - borrow; + result = (double) hi * (1 << 30) * 4 + lo; + return result; +} +void make_CPU_busy(void) +{ + volatile double old_tick,new_tick; + start_counter(); + old_tick = get_counter(); + new_tick = get_counter(); + while (new_tick - old_tick < 1000000000) + new_tick = get_counter(); +} + +//CPU的频率 +double mhz(int verbose) +{ + LARGE_INTEGER lFrequency; + LARGE_INTEGER lPerformanceCount_Start; + LARGE_INTEGER lPerformanceCount_End; + double mhz; + double fTime; + __int64 _i64StartCpuCounter; + __int64 _i64EndCpuCounter; + //On a multiprocessor machine, it should not matter which processor is called. + //However, you can get different results on different processors due to bugs in + //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. + HANDLE hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + + //主板上高精度定时器的晶振频率 + //这个定时器应该就是一片8253或者8254 + //在intel ich7中集成了8254 + QueryPerformanceFrequency(&lFrequency); +// if (verbose>0) +// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart); + + //这个定时器每经过一个时钟周期,其计数器会+1 + QueryPerformanceCounter(&lPerformanceCount_Start); + + //RDTSC指令:获取CPU经历的时钟周期数 + _i64StartCpuCounter=__rdtsc(); + + //延时长一点,误差会小一点 + //int nTemp=100000; + //while (--nTemp); + Sleep(200); + + QueryPerformanceCounter(&lPerformanceCount_End); + + _i64EndCpuCounter=__rdtsc(); + + //f=1/T => f=计数次数/(计数次数*T) + //这里的“计数次数*T”就是时间差 + fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) + /(double)lFrequency.QuadPart; + + mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); + if (verbose>0) + printf("CPU频率为:%1.6fMHz.\n",mhz); + return mhz; +} + +double CPU_Factor1(void) +{ + double result; + int i,j,k,ii,jj,kk; + LARGE_INTEGER lStart,lEnd; + LARGE_INTEGER lFrequency; + HANDLE hThread; + double fTime; + + QueryPerformanceFrequency(&lFrequency); + + ii = 43273; + kk = 1238; + result = 1; + jj = 1244; + + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + QueryPerformanceCounter(&lStart); + //_asm("cpuid"); + start_counter(); + for (i=0;i<100;i++) + for (j=0;j<1000;j++) + for (k=0;k<1000;k++) + kk += kk*ii+jj; + + result = get_counter(); + QueryPerformanceCounter(&lEnd); + fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); + printf("CPU运行时间为%f",result); + printf("\t %f\n",fTime); + return result; +} + +double CPU_Factor(void) +{ + double frequency; + double multiplier = 1000 * 1000 * 1000;//nano + LARGE_INTEGER lFrequency; + LARGE_INTEGER start,stop; + HANDLE hThread; + int i; + const int gigahertz= 1000*1000*1000; + const int known_instructions_per_loop = 27317; + + int iterations = 100000000; + int g = 0; + double normal_ticks_per_second; +double ticks; +double time; +double loops_per_sec; +double instructions_per_loop; +double ratio; +double actual_freq; + + QueryPerformanceFrequency(&lFrequency); + frequency = (double)lFrequency.QuadPart; + + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + QueryPerformanceCounter(&start); + for( i = 0; i < iterations; i++) + { + g++; + g++; + g++; + g++; + } + QueryPerformanceCounter(&stop); + + //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 + normal_ticks_per_second = frequency * 1000; + ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); + time = (ticks * multiplier) /frequency; + loops_per_sec = iterations / (time/multiplier); + instructions_per_loop = normal_ticks_per_second / loops_per_sec; + + ratio = (instructions_per_loop / known_instructions_per_loop); + actual_freq = normal_ticks_per_second / ratio; +/* + actual_freq = normal_ticks_per_second / ratio; + actual_freq = known_instructions_per_loop*iterations*multiplier/time; + + 2293 = x/time; + + 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 + loops_per_sec = iterations*frequency / ticks + + instructions_per_loop = / loops_per_sec; +*/ + printf("Perf counter freq: %f\n", normal_ticks_per_second); + printf("Loops per sec: %f\n", loops_per_sec); + printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); + printf("Presumed freq: %f\n", actual_freq); + printf("ratio: %f\n", ratio); + printf("time=%f\n",time); + return ratio; +} diff --git a/perflab/matrix/clock.o b/perflab/matrix/clock.o new file mode 100644 index 0000000..0e68ee3 Binary files /dev/null and b/perflab/matrix/clock.o differ diff --git a/perflab/matrix/cpe.o b/perflab/matrix/cpe.o new file mode 100644 index 0000000..96d717d Binary files /dev/null and b/perflab/matrix/cpe.o differ diff --git a/perflab/matrix/fcyc.c b/perflab/matrix/fcyc.c index dc2f735..a5f4077 100644 --- a/perflab/matrix/fcyc.c +++ b/perflab/matrix/fcyc.c @@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params) if (clear_cache) clear(); start_counter(); - f(params); + f((long*)params); cyc = get_counter(); if (cyc > 0.0) add_sample(cyc); @@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params) clear(); start_counter(); for (i=0;i 0.0) add_sample(cyc); diff --git a/perflab/matrix/fcyc.o b/perflab/matrix/fcyc.o new file mode 100644 index 0000000..0943503 Binary files /dev/null and b/perflab/matrix/fcyc.o differ diff --git a/perflab/matrix/lsquare.o b/perflab/matrix/lsquare.o new file mode 100644 index 0000000..f36c57e Binary files /dev/null and b/perflab/matrix/lsquare.o differ diff --git a/perflab/matrix/matrix_test b/perflab/matrix/matrix_test new file mode 100644 index 0000000..9fbc0a4 Binary files /dev/null and b/perflab/matrix/matrix_test differ diff --git a/perflab/matrix/rowcol.c b/perflab/matrix/rowcol.c index 9e50cbb..b504582 100644 --- a/perflab/matrix/rowcol.c +++ b/perflab/matrix/rowcol.c @@ -1,77 +1,69 @@ /************************************************************************** - 行/列求和函数。按下面的要求编辑此文件: - 1. 将你的学号、姓名,以注释的方式写到下面; - 2. 实现不同版本的行列求和函数; - 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案 - (最好的行和列求和、最好的列求和)作为数组的前两项 + ??/??????????????????????????????? + 1. ??????????????????????????????? + 2. ?????????????????????? + 3. ??rc_fun_rec rc_fun_tab?????????????????? + ??????????????????????????????????????????? ***************************************************************************/ - + /* - 学号:201209054233 - 姓名:夜半加班狂 + ????201209054233 + ?????????????? */ +#include "rowcol.h" +#include +#include +#include -#include -#include -#include "rowcol.h" -#include - -/* 参考的列求和函数实现 */ -/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是 - 一样的,只是第2个参数不会用到而已 +/* ????????????????? */ +/* ??????????????????????????????????????????????? + ??????????2????????????????? */ -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} + +/* ???????????????????? */ +/* ??????????????????????? */ + +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; } + } } - -/* 参考的列和行求和函数实现 */ -/* 计算矩阵中的每一行、每一列的和。 */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - - - -/* - 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串") - COL表示该函数仅仅计算每一列的和 - ROWCOL表示该函数计算每一行、每一列的和 - 将你认为最好的两个实现,放在最前面。 - 比如: - {my_c_sum1, "超级垃圾列求和实现"}, - {my_rc_sum2, "好一点的行列求和实现"}, +/* + ????????????????????????????????????????, COL/ROWCOL, "?????????"?? + COL?????????????????????? + ROWCOL??????????????????????? + ????????????????????????????? + ???? + {my_c_sum1, "?????????????????"}, + {my_rc_sum2, "??????????????????"}, */ -rc_fun_rec rc_fun_tab[] = -{ +rc_fun_rec rc_fun_tab[] = { - /* 第一项,应当是你写的最好列求和的函数实现 */ + /* ???????????????????????????????? */ {c_sum, COL, "Best column sum"}, - /* 第二项,应当是你写的最好行列求和的函数实现 */ + /* ?????????????????????????????????? */ {rc_sum, ROWCOL, "Best row and column sum"}, {c_sum, COL, "Column sum, reference implementation"}, {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - /* 下面的代码不能修改或者删除!!表明数组列表结束 */ - {NULL,ROWCOL,NULL} -}; + /* ??????????????????????????????????????? */ + {NULL, ROWCOL, NULL}}; \ No newline at end of file diff --git a/perflab/matrix/rowcol.c~ b/perflab/matrix/rowcol.c~ new file mode 100644 index 0000000..990ce83 --- /dev/null +++ b/perflab/matrix/rowcol.c~ @@ -0,0 +1,162 @@ +/************************************************************************** + 行/列求和函数。按下面的要求编辑此文件: + 1. 将你的学号、姓名,以注释的方式写到下面; + 2. 实现不同版本的行列求和函数; + 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案 + (最好的行和列求和、最好的列求和)作为数组的前两项 +***************************************************************************/ + +/* + 学号:202302723005 + 姓名:程景愉 +*/ + + +#include +#include +#include "rowcol.h" +#include +#include + +/* 参考的列求和函数实现 */ +/* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是 + 一样的,只是第2个参数不会用到而已 +*/ + +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} + + +/* 参考的列和行求和函数实现 */ +/* 计算矩阵中的每一行、每一列的和。 */ + +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; + for (j = 0; j < N; j++) { + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; + } + } +} + +/* CUDA优化的列求和函数 */ +void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + // 分配设备内存 + int *d_M, *d_colsum; + cudaMalloc(&d_M, N * N * sizeof(int)); + cudaMalloc(&d_colsum, N * sizeof(int)); + + // 将数据从主机复制到设备 + cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); + + // 定义CUDA核函数 + dim3 blockDim(256); + dim3 gridDim((N + blockDim.x - 1) / blockDim.x); + + // 启动核函数 + cudaColumnSum<<>>(d_M, d_colsum); + + // 将结果从设备复制回主机 + cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); + + // 释放设备内存 + cudaFree(d_M); + cudaFree(d_colsum); +} + +/* CUDA优化的行列求和函数 */ +void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + // 分配设备内存 + int *d_M, *d_rowsum, *d_colsum; + cudaMalloc(&d_M, N * N * sizeof(int)); + cudaMalloc(&d_rowsum, N * sizeof(int)); + cudaMalloc(&d_colsum, N * sizeof(int)); + + // 将数据从主机复制到设备 + cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); + + // 定义CUDA核函数 + dim3 blockDim(256); + dim3 gridDim((N + blockDim.x - 1) / blockDim.x); + + // 启动核函数 + cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); + + // 将结果从设备复制回主机 + cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); + + // 释放设备内存 + cudaFree(d_M); + cudaFree(d_rowsum); + cudaFree(d_colsum); +} + +/* CUDA核函数 - 列求和 */ +__global__ void cudaColumnSum(int *M, int *colsum) +{ + int col = blockIdx.x * blockDim.x + threadIdx.x; + if (col < N) { + colsum[col] = 0; + for (int row = 0; row < N; row++) { + colsum[col] += M[row * N + col]; + } + } +} + +/* CUDA核函数 - 行列求和 */ +__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N) { + // 计算行和 + rowsum[idx] = 0; + for (int j = 0; j < N; j++) { + rowsum[idx] += M[idx * N + j]; + } + + // 计算列和 + colsum[idx] = 0; + for (int i = 0; i < N; i++) { + colsum[idx] += M[i * N + idx]; + } + } +} + +/* + 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串") + COL表示该函数仅仅计算每一列的和 + ROWCOL表示该函数计算每一行、每一列的和 + 将你认为最好的两个实现,放在最前面。 + 比如: + {my_c_sum1, "超级垃圾列求和实现"}, + {my_rc_sum2, "好一点的行列求和实现"}, +*/ + +rc_fun_rec rc_fun_tab[] = +{ + + /* 第一项,应当是你写的最好列求和的函数实现 */ + {cuda_c_sum, COL, "CUDA optimized column sum"}, + /* 第二项,应当是你写的最好行列求和的函数实现 */ + {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, + + {c_sum, COL, "Column sum, reference implementation"}, + + {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, + + /* 下面的代码不能修改或者删除!!表明数组列表结束 */ + {NULL,ROWCOL,NULL} +}; diff --git a/perflab/matrix/rowcol.o b/perflab/matrix/rowcol.o new file mode 100644 index 0000000..abada5f Binary files /dev/null and b/perflab/matrix/rowcol.o differ diff --git a/perflab/matrix/rowcol.y~ b/perflab/matrix/rowcol.y~ new file mode 100644 index 0000000..5d3310a --- /dev/null +++ b/perflab/matrix/rowcol.y~ @@ -0,0 +1,240 @@ +/************************************************************************** + 靠/靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 + 1. 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 + 2. 靠靠靠靠靠靠靠靠靠靠靠 + 3. 靠rc_fun_rec rc_fun_tab靠靠靠靠靠靠旷靠靠 + 靠靠旷靠靠靠靠靠靠靠蹩靠靠靠靠靠靠靠靠靠靠 +***************************************************************************/ + +/* + 靠靠201209054233 + 靠靠靠靠靠靠靠 +*/ + + +#include +#include +#include "rowcol.h" +#include + +/* 靠靠靠靠靠靠靠靠 */ +/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠 + 靠靠靠靠靠2靠靠靠靠靠旷靠靠 +*/ + +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} + + +/* 靠靠靠靠靠靠靠靠靠靠 */ +/* 靠靠靠靠靠靠靠靠靠靠 */ + +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; + for (j = 0; j < N; j++) { + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; + } + } +} + + + +/* + 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠, COL/ROWCOL, "靠靠靠靠"靠 + COL靠靠靠靠靠靠靠靠靠 + ROWCOL靠靠靠靠靠靠靠靠靠 + 靠靠靠靠靠蹩靠靠靠靠靠靠靠靠 + 靠靠 + {my_c_sum1, "靠靠靠靠靠靠靠靠"}, + {my_rc_sum2, "靠靠靠靠靠靠靠靠靠"}, +*/ + +rc_fun_rec rc_fun_tab[] = +{ + + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {c_sum, COL, "Best column sum"}, + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {rc_sum, ROWCOL, "Best row and column sum"}, + + {c_sum, COL, "Column sum, reference implementation"}, + + {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, + + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {NULL,ROWCOL,NULL} +}; + +// /************************************************************************** +// 行/列求和函数。按下面的要求编辑此文件: +// 1. 将你的学号、姓名,以注释的方式写到下面; +// 2. 实现不同版本的行列求和函数; +// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案 +// (最好的行和列求和、最好的列求和)作为数组的前两项 +// ***************************************************************************/ +// +// /* +// 学号:202302723005 +// 姓名:程景愉 +// */ +// +// +// #include +// #include +// #include "rowcol.h" +// #include +// #include +// +// /* 参考的列求和函数实现 */ +// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是 +// 一样的,只是第2个参数不会用到而已 +// */ +// +// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// int i,j; +// for (j = 0; j < N; j++) { +// colsum[j] = 0; +// for (i = 0; i < N; i++) +// colsum[j] += M[i][j]; +// } +// } +// +// +// /* 参考的列和行求和函数实现 */ +// /* 计算矩阵中的每一行、每一列的和。 */ +// +// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// int i,j; +// for (i = 0; i < N; i++) { +// rowsum[i] = colsum[i] = 0; +// for (j = 0; j < N; j++) { +// rowsum[i] += M[i][j]; +// colsum[i] += M[j][i]; +// } +// } +// } +// +// /* CUDA优化的列求和函数 */ +// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// // 分配设备内存 +// int *d_M, *d_colsum; +// cudaMalloc(&d_M, N * N * sizeof(int)); +// cudaMalloc(&d_colsum, N * sizeof(int)); +// +// // 将数据从主机复制到设备 +// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); +// +// // 定义CUDA核函数 +// dim3 blockDim(256); +// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); +// +// // 启动核函数 +// cudaColumnSum<<>>(d_M, d_colsum); +// +// // 将结果从设备复制回主机 +// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// +// // 释放设备内存 +// cudaFree(d_M); +// cudaFree(d_colsum); +// } +// +// /* CUDA优化的行列求和函数 */ +// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// // 分配设备内存 +// int *d_M, *d_rowsum, *d_colsum; +// cudaMalloc(&d_M, N * N * sizeof(int)); +// cudaMalloc(&d_rowsum, N * sizeof(int)); +// cudaMalloc(&d_colsum, N * sizeof(int)); +// +// // 将数据从主机复制到设备 +// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); +// +// // 定义CUDA核函数 +// dim3 blockDim(256); +// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); +// +// // 启动核函数 +// cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); +// +// // 将结果从设备复制回主机 +// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// +// // 释放设备内存 +// cudaFree(d_M); +// cudaFree(d_rowsum); +// cudaFree(d_colsum); +// } +// +// /* CUDA核函数 - 列求和 */ +// __global__ void cudaColumnSum(int *M, int *colsum) +// { +// int col = blockIdx.x * blockDim.x + threadIdx.x; +// if (col < N) { +// colsum[col] = 0; +// for (int row = 0; row < N; row++) { +// colsum[col] += M[row * N + col]; +// } +// } +// } +// +// /* CUDA核函数 - 行列求和 */ +// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) +// { +// int idx = blockIdx.x * blockDim.x + threadIdx.x; +// if (idx < N) { +// // 计算行和 +// rowsum[idx] = 0; +// for (int j = 0; j < N; j++) { +// rowsum[idx] += M[idx * N + j]; +// } +// +// // 计算列和 +// colsum[idx] = 0; +// for (int i = 0; i < N; i++) { +// colsum[idx] += M[i * N + idx]; +// } +// } +// } +// +// /* +// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串") +// COL表示该函数仅仅计算每一列的和 +// ROWCOL表示该函数计算每一行、每一列的和 +// 将你认为最好的两个实现,放在最前面。 +// 比如: +// {my_c_sum1, "超级垃圾列求和实现"}, +// {my_rc_sum2, "好一点的行列求和实现"}, +// */ +// +// rc_fun_rec rc_fun_tab[] = +// { +// +// /* 第一项,应当是你写的最好列求和的函数实现 */ +// {cuda_c_sum, COL, "CUDA optimized column sum"}, +// /* 第二项,应当是你写的最好行列求和的函数实现 */ +// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, +// +// {c_sum, COL, "Column sum, reference implementation"}, +// +// {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, +// +// /* 下面的代码不能修改或者删除!!表明数组列表结束 */ +// {NULL,ROWCOL,NULL} +// }; diff --git a/perflab/matrix/rowcol.z~ b/perflab/matrix/rowcol.z~ new file mode 100644 index 0000000..5d3310a --- /dev/null +++ b/perflab/matrix/rowcol.z~ @@ -0,0 +1,240 @@ +/************************************************************************** + 靠/靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 + 1. 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 + 2. 靠靠靠靠靠靠靠靠靠靠靠 + 3. 靠rc_fun_rec rc_fun_tab靠靠靠靠靠靠旷靠靠 + 靠靠旷靠靠靠靠靠靠靠蹩靠靠靠靠靠靠靠靠靠靠 +***************************************************************************/ + +/* + 靠靠201209054233 + 靠靠靠靠靠靠靠 +*/ + + +#include +#include +#include "rowcol.h" +#include + +/* 靠靠靠靠靠靠靠靠 */ +/* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠蚩靠靠 + 靠靠靠靠靠2靠靠靠靠靠旷靠靠 +*/ + +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} + + +/* 靠靠靠靠靠靠靠靠靠靠 */ +/* 靠靠靠靠靠靠靠靠靠靠 */ + +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; + for (j = 0; j < N; j++) { + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; + } + } +} + + + +/* + 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠, COL/ROWCOL, "靠靠靠靠"靠 + COL靠靠靠靠靠靠靠靠靠 + ROWCOL靠靠靠靠靠靠靠靠靠 + 靠靠靠靠靠蹩靠靠靠靠靠靠靠靠 + 靠靠 + {my_c_sum1, "靠靠靠靠靠靠靠靠"}, + {my_rc_sum2, "靠靠靠靠靠靠靠靠靠"}, +*/ + +rc_fun_rec rc_fun_tab[] = +{ + + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {c_sum, COL, "Best column sum"}, + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {rc_sum, ROWCOL, "Best row and column sum"}, + + {c_sum, COL, "Column sum, reference implementation"}, + + {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, + + /* 靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠靠 */ + {NULL,ROWCOL,NULL} +}; + +// /************************************************************************** +// 行/列求和函数。按下面的要求编辑此文件: +// 1. 将你的学号、姓名,以注释的方式写到下面; +// 2. 实现不同版本的行列求和函数; +// 3. 编辑rc_fun_rec rc_fun_tab数组,将你的最好的答案 +// (最好的行和列求和、最好的列求和)作为数组的前两项 +// ***************************************************************************/ +// +// /* +// 学号:202302723005 +// 姓名:程景愉 +// */ +// +// +// #include +// #include +// #include "rowcol.h" +// #include +// #include +// +// /* 参考的列求和函数实现 */ +// /* 计算矩阵中的每一列的和。请注意对于行和列求和来说,调用参数是 +// 一样的,只是第2个参数不会用到而已 +// */ +// +// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// int i,j; +// for (j = 0; j < N; j++) { +// colsum[j] = 0; +// for (i = 0; i < N; i++) +// colsum[j] += M[i][j]; +// } +// } +// +// +// /* 参考的列和行求和函数实现 */ +// /* 计算矩阵中的每一行、每一列的和。 */ +// +// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// int i,j; +// for (i = 0; i < N; i++) { +// rowsum[i] = colsum[i] = 0; +// for (j = 0; j < N; j++) { +// rowsum[i] += M[i][j]; +// colsum[i] += M[j][i]; +// } +// } +// } +// +// /* CUDA优化的列求和函数 */ +// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// // 分配设备内存 +// int *d_M, *d_colsum; +// cudaMalloc(&d_M, N * N * sizeof(int)); +// cudaMalloc(&d_colsum, N * sizeof(int)); +// +// // 将数据从主机复制到设备 +// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); +// +// // 定义CUDA核函数 +// dim3 blockDim(256); +// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); +// +// // 启动核函数 +// cudaColumnSum<<>>(d_M, d_colsum); +// +// // 将结果从设备复制回主机 +// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// +// // 释放设备内存 +// cudaFree(d_M); +// cudaFree(d_colsum); +// } +// +// /* CUDA优化的行列求和函数 */ +// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +// { +// // 分配设备内存 +// int *d_M, *d_rowsum, *d_colsum; +// cudaMalloc(&d_M, N * N * sizeof(int)); +// cudaMalloc(&d_rowsum, N * sizeof(int)); +// cudaMalloc(&d_colsum, N * sizeof(int)); +// +// // 将数据从主机复制到设备 +// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); +// +// // 定义CUDA核函数 +// dim3 blockDim(256); +// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); +// +// // 启动核函数 +// cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); +// +// // 将结果从设备复制回主机 +// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); +// +// // 释放设备内存 +// cudaFree(d_M); +// cudaFree(d_rowsum); +// cudaFree(d_colsum); +// } +// +// /* CUDA核函数 - 列求和 */ +// __global__ void cudaColumnSum(int *M, int *colsum) +// { +// int col = blockIdx.x * blockDim.x + threadIdx.x; +// if (col < N) { +// colsum[col] = 0; +// for (int row = 0; row < N; row++) { +// colsum[col] += M[row * N + col]; +// } +// } +// } +// +// /* CUDA核函数 - 行列求和 */ +// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) +// { +// int idx = blockIdx.x * blockDim.x + threadIdx.x; +// if (idx < N) { +// // 计算行和 +// rowsum[idx] = 0; +// for (int j = 0; j < N; j++) { +// rowsum[idx] += M[idx * N + j]; +// } +// +// // 计算列和 +// colsum[idx] = 0; +// for (int i = 0; i < N; i++) { +// colsum[idx] += M[i * N + idx]; +// } +// } +// } +// +// /* +// 这个表格包含多个数组元素,每一组元素(函数名字, COL/ROWCOL, "描述字符串") +// COL表示该函数仅仅计算每一列的和 +// ROWCOL表示该函数计算每一行、每一列的和 +// 将你认为最好的两个实现,放在最前面。 +// 比如: +// {my_c_sum1, "超级垃圾列求和实现"}, +// {my_rc_sum2, "好一点的行列求和实现"}, +// */ +// +// rc_fun_rec rc_fun_tab[] = +// { +// +// /* 第一项,应当是你写的最好列求和的函数实现 */ +// {cuda_c_sum, COL, "CUDA optimized column sum"}, +// /* 第二项,应当是你写的最好行列求和的函数实现 */ +// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, +// +// {c_sum, COL, "Column sum, reference implementation"}, +// +// {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, +// +// /* 下面的代码不能修改或者删除!!表明数组列表结束 */ +// {NULL,ROWCOL,NULL} +// }; diff --git a/perflab/matrix/rowcol_202302723005.c b/perflab/matrix/rowcol_202302723005.c new file mode 100644 index 0000000..b504582 --- /dev/null +++ b/perflab/matrix/rowcol_202302723005.c @@ -0,0 +1,69 @@ +/************************************************************************** + ??/??????????????????????????????? + 1. ??????????????????????????????? + 2. ?????????????????????? + 3. ??rc_fun_rec rc_fun_tab?????????????????? + ??????????????????????????????????????????? +***************************************************************************/ + +/* + ????201209054233 + ?????????????? +*/ + +#include "rowcol.h" +#include +#include +#include + +/* ????????????????? */ +/* ??????????????????????????????????????????????? + ??????????2????????????????? +*/ + +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} + +/* ???????????????????? */ +/* ??????????????????????? */ + +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; + for (j = 0; j < N; j++) { + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; + } + } +} + +/* + ????????????????????????????????????????, COL/ROWCOL, "?????????"?? + COL?????????????????????? + ROWCOL??????????????????????? + ????????????????????????????? + ???? + {my_c_sum1, "?????????????????"}, + {my_rc_sum2, "??????????????????"}, +*/ + +rc_fun_rec rc_fun_tab[] = { + + /* ???????????????????????????????? */ + {c_sum, COL, "Best column sum"}, + /* ?????????????????????????????????? */ + {rc_sum, ROWCOL, "Best row and column sum"}, + + {c_sum, COL, "Column sum, reference implementation"}, + + {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, + + /* ??????????????????????????????????????? */ + {NULL, ROWCOL, NULL}}; \ No newline at end of file diff --git a/perflab/matrix/rowcol_202302723005.o b/perflab/matrix/rowcol_202302723005.o new file mode 100644 index 0000000..b28983c Binary files /dev/null and b/perflab/matrix/rowcol_202302723005.o differ diff --git a/perflab/matrix/rowcol_test.c b/perflab/matrix/rowcol_test.c index 6b67926..e6a046e 100644 --- a/perflab/matrix/rowcol_test.c +++ b/perflab/matrix/rowcol_test.c @@ -1,9 +1,9 @@ #include #include -//#include -#include "rowcol.h" -#include "fcyc.h" +// #include #include "clock.h" +#include "fcyc.h" +#include "rowcol.h" #define MAX_ITER_COUNT 100 @@ -11,9 +11,9 @@ static struct { double cref; /* Cycles taken by reference solution */ double cbest; /* Cycles taken by our best implementation */ -} cstandard[2] = -{{7.7, 6.40}, /* Column Sum */ - {9.75, 6.60} /* Row & Column Sum */ +} cstandard[2] = { + {7.7, 6.40}, /* Column Sum */ + {9.75, 6.60} /* Row & Column Sum */ }; /* Put in code to align matrix so that it starts on a cache block boundary. @@ -26,7 +26,7 @@ static struct { #define WPB 16 int verbose = 1; -int data[N*N+WPB]; +int data[N * N + WPB]; int *mstart; typedef vector_t *row_t; @@ -37,137 +37,122 @@ vector_t rsref, csref, rcomp, ccomp; static void init_tests(void); extern void make_CPU_busy(void); -static void init_tests(void) -{ - int i, j; - size_t bytes_per_block = sizeof(int) * WPB; - /* round mstart up to nearest block boundary */ - mstart = (int *) - (((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block); - for (i = 0; i < N; i++) { - rsref[i] = csref[i] = 0; - } - for (i = 0; i < N; i++) { - for (j = 0; j < N; j++) { - int val = rand(); - mstart[i*N+j] = val; - rsref[i] += val; - csref[j] += val; - } +static void init_tests(void) { + int i, j; + size_t bytes_per_block = sizeof(int) * WPB; + /* round mstart up to nearest block boundary */ + mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block * + bytes_per_block); + for (i = 0; i < N; i++) { + rsref[i] = csref[i] = 0; + } + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + int val = rand(); + mstart[i * N + j] = val; + rsref[i] += val; + csref[j] += val; } + } } - /* Test function on all values */ int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) { - int i; - int ok = 1; + int i; + int ok = 1; - for (i = 0; i < N; i++) - rcomp[i] = ccomp[i] = 0xDEADBEEF; - f((row_t)mstart, rcomp, ccomp); - - for (i = 0; ok && i < N; i++) { - if (rc_type == ROWCOL - && rsref[i] != rcomp[i]) { - ok = 0; - if (rpt) - fprintf(rpt, - "对第%d行的计算出错!正确结果是%d,但是计算得到%d\n", - i, rsref[i], rcomp[i]); - } - if ((rc_type == ROWCOL || rc_type == COL) - && csref[i] != ccomp[i]) { - ok = 0; - if (rpt) - fprintf(rpt, - "对第%d列的计算出错!正确结果是%d,但是计算得到%d\n", - i, csref[i], ccomp[i]); - } + for (i = 0; i < N; i++) + rcomp[i] = ccomp[i] = 0xDEADBEEF; + f((row_t)mstart, rcomp, ccomp); + for (i = 0; ok && i < N; i++) { + if (rc_type == ROWCOL && rsref[i] != rcomp[i]) { + ok = 0; + if (rpt) + fprintf(rpt, "瀵圭%d琛岀殑璁$畻鍑洪敊锛佹纭粨鏋滄槸%d锛屼絾鏄绠楀緱鍒%d\n", i, + rsref[i], rcomp[i]); } - return ok; + if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) { + ok = 0; + if (rpt) + fprintf(rpt, "瀵圭%d鍒楃殑璁$畻鍑洪敊锛佹纭粨鏋滄槸%d锛屼絾鏄绠楀緱鍒%d\n", i, + csref[i], ccomp[i]); + } + } + return ok; } /* Kludgy way to interface to cycle measuring code */ -void do_test(int *intf) -{ - rc_fun f = (rc_fun) intf; +void do_test(int *intf) { + rc_fun f = (rc_fun)intf; f((row_t)mstart, rcomp, ccomp); } -void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) -{ - int i; - int *intf = (int *) f; +void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) { + int i; + int *intf = (int *)f; double t, cme; t = 0; - if (verbose) printf("函数:%s\n", descr); + if (verbose) + printf("鍑芥暟锛%s\n", descr); if (test_rc(f, stdout, rc_type)) { - make_CPU_busy(); - for (i=0;i 1.1*(sbest-1)+1) + if (smeas > 1.1 * (sbest - 1) + 1) return 120; - return 100*((smeas-1.0)/(sbest-1.0) + 0.1); + return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1); } -int main(int argc, char *argv[]) -{ +int main(int argc, char *argv[]) { int i; double cme; - double cme_c,cme_rc; - int EnableScore=0; - - if (argc == 3) - { - EnableScore = 1; - verbose = 0; + double cme_c, cme_rc; + int EnableScore = 0; + + if (argc == 3) { + EnableScore = 1; + verbose = 0; } init_tests(); - set_fcyc_clear_cache(1); /* Set so that clears cache between runs */ + set_fcyc_clear_cache(1); /* Set so that clears cache between runs */ for (i = 0; rc_fun_tab[i].f != NULL; i++) { - cme = 100.0; - time_rc(rc_fun_tab[i].f, - rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme); - if (i == 0) - { - cme_c = cme; - if (EnableScore==0) - { - printf(" 最高\"列求和\"得分 ======================== %.0f\n", - compute_score(cme, cstandard[0].cref, cstandard[0].cbest)); - } - } - if (i == 1) - { - cme_rc = cme; - if (EnableScore==0) - { - printf(" 最高\"行和列求和\"得分 ====================== %.0f\n", - compute_score(cme, cstandard[1].cref, cstandard[1].cbest)); - } - } + cme = 100.0; + time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme); + if (i == 0) { + cme_c = cme; + if (EnableScore == 0) { + printf(" 鏈楂榎"鍒楁眰鍜孿"寰楀垎 ======================== %.0f\n", + compute_score(cme, cstandard[0].cref, cstandard[0].cbest)); + } + } + if (i == 1) { + cme_rc = cme; + if (EnableScore == 0) { + printf(" 鏈楂榎"琛屽拰鍒楁眰鍜孿"寰楀垎 ====================== %.0f\n", + compute_score(cme, cstandard[1].cref, cstandard[1].cbest)); + } + } } - + if (EnableScore) - printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), - cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest)); + printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c, + compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc, + compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest)); return 0; } diff --git a/perflab/matrix/rowcol_test.o b/perflab/matrix/rowcol_test.o new file mode 100644 index 0000000..d214ba4 Binary files /dev/null and b/perflab/matrix/rowcol_test.o differ diff --git a/perflab/poly/Makefile b/perflab/poly/Makefile new file mode 100644 index 0000000..9f55dad --- /dev/null +++ b/perflab/poly/Makefile @@ -0,0 +1,35 @@ +CC = gcc +NVCC = nvcc +CFLAGS = -Wall -O2 -g +CUDA_FLAGS = -O2 -g +LDFLAGS = -lm -lcudart + +# Source files +SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c +CUDA_SRCS = poly.cu +OBJS = $(SRCS:.c=.o) poly.o + +# Target executable +TARGET = poly_test + +# Default target +all: $(TARGET) + +# Rule to build the executable +$(TARGET): $(OBJS) + $(CC) $(OBJS) -o $(TARGET) $(LDFLAGS) + +# Rule to build object files +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +# Rule to build CUDA object files +poly.o: poly.cu + $(NVCC) $(CUDA_FLAGS) -c $< -o $@ + +# Clean rule +clean: + rm -f $(OBJS) $(TARGET) + +# Phony targets +.PHONY: all clean \ No newline at end of file diff --git a/perflab/poly/clock.c b/perflab/poly/clock.c index a587590..159ba4e 100644 --- a/perflab/poly/clock.c +++ b/perflab/poly/clock.c @@ -13,11 +13,11 @@ #include #include #include -#include -//#include -#include -#include +#include +// #include #include "clock.h" +#include +#include /* Use x86 cycle counter */ @@ -27,203 +27,195 @@ static unsigned cyc_lo = 0; /* Set *hi and *lo to the high and low order bits of the cycle counter. Implementation requires assembly code to use the rdtsc instruction. */ -void access_counter(unsigned *hi, unsigned *lo) -{ +void access_counter(unsigned *hi, unsigned *lo) { - long long counter; + long long counter; - counter = __rdtsc(); - (*hi) = (unsigned int)(counter >> 32); - (*lo) = (unsigned int)counter; -/* + counter = __rdtsc(); + (*hi) = (unsigned int)(counter >> 32); + (*lo) = (unsigned int)counter; + /* - LARGE_INTEGER lPerformanceCount; + LARGE_INTEGER lPerformanceCount; - QueryPerformanceCounter(&lPerformanceCount); - (*hi) = (unsigned int)lPerformanceCount.HighPart; - (*lo) = (unsigned int)lPerformanceCount.LowPart; -// printf("%08X %08X\n",(*hi),(*lo)); -*/ + QueryPerformanceCounter(&lPerformanceCount); + (*hi) = (unsigned int)lPerformanceCount.HighPart; + (*lo) = (unsigned int)lPerformanceCount.LowPart; + // printf("%08X %08X\n",(*hi),(*lo)); + */ } - /* Record the current value of the cycle counter. */ -void start_counter() -{ - access_counter(&cyc_hi, &cyc_lo); -} +void start_counter() { access_counter(&cyc_hi, &cyc_lo); } /* Return the number of cycles since the last call to start_counter. */ -double get_counter() -{ - unsigned ncyc_hi, ncyc_lo; - unsigned hi, lo, borrow; - double result; +double get_counter() { + unsigned ncyc_hi, ncyc_lo; + unsigned hi, lo, borrow; + double result; - /* Get cycle counter */ - access_counter(&ncyc_hi, &ncyc_lo); + /* Get cycle counter */ + access_counter(&ncyc_hi, &ncyc_lo); - /* Do double precision subtraction */ - lo = ncyc_lo - cyc_lo; - borrow = cyc_lo > ncyc_lo; - hi = ncyc_hi - cyc_hi - borrow; - result = (double) hi * (1 << 30) * 4 + lo; - return result; + /* Do double precision subtraction */ + lo = ncyc_lo - cyc_lo; + borrow = cyc_lo > ncyc_lo; + hi = ncyc_hi - cyc_hi - borrow; + result = (double)hi * (1 << 30) * 4 + lo; + return result; } -void make_CPU_busy(void) -{ - volatile double old_tick,new_tick; - start_counter(); - old_tick = get_counter(); - new_tick = get_counter(); - while (new_tick - old_tick < 1000000000) - new_tick = get_counter(); +void make_CPU_busy(void) { + volatile double old_tick, new_tick; + start_counter(); + old_tick = get_counter(); + new_tick = get_counter(); + while (new_tick - old_tick < 1000000000) + new_tick = get_counter(); } -//CPU的频率 -double mhz(int verbose) -{ - LARGE_INTEGER lFrequency; - LARGE_INTEGER lPerformanceCount_Start; - LARGE_INTEGER lPerformanceCount_End; - double mhz; - double fTime; - __int64 _i64StartCpuCounter; - __int64 _i64EndCpuCounter; - //On a multiprocessor machine, it should not matter which processor is called. - //However, you can get different results on different processors due to bugs in - //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. - HANDLE hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); +// CPU锟斤拷频锟斤拷 +double mhz(int verbose) { + LARGE_INTEGER lFrequency; + LARGE_INTEGER lPerformanceCount_Start; + LARGE_INTEGER lPerformanceCount_End; + double mhz; + double fTime; + __int64 _i64StartCpuCounter; + __int64 _i64EndCpuCounter; + // On a multiprocessor machine, it should not matter which processor is + // called. However, you can get different results on different processors due + // to bugs in the BIOS or the HAL. To specify processor affinity for a thread, + // use the SetThreadAffinityMask function. + HANDLE hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); - //主板上高精度定时器的晶振频率 - //这个定时器应该就是一片8253或者8254 - //在intel ich7中集成了8254 - QueryPerformanceFrequency(&lFrequency); -// if (verbose>0) -// printf("高精度定时器的晶振频率:%1.0fHz.\n",(double)lFrequency.QuadPart); + // 锟斤拷锟斤拷锟较高撅拷锟饺讹拷时锟斤拷锟侥撅拷锟斤拷频锟斤拷 + // 锟斤拷锟斤拷锟绞憋拷锟接︼拷镁锟斤拷锟揭黄1锟78253锟斤拷锟斤拷8254 + // 锟斤拷intel ich7锟叫硷拷锟斤拷锟斤拷8254 + QueryPerformanceFrequency(&lFrequency); + // if (verbose>0) + // printf("锟竭撅拷锟饺讹拷时锟斤拷锟侥撅拷锟斤拷频锟绞o拷%1.0fHz.\n",(double)lFrequency.QuadPart); - //这个定时器每经过一个时钟周期,其计数器会+1 - QueryPerformanceCounter(&lPerformanceCount_Start); + // 锟斤拷锟斤拷锟绞憋拷锟矫匡拷锟斤拷锟揭伙拷锟绞憋拷锟斤拷锟斤拷冢锟斤拷锟斤拷锟斤拷锟斤拷锟斤拷+1 + QueryPerformanceCounter(&lPerformanceCount_Start); - //RDTSC指令:获取CPU经历的时钟周期数 - _i64StartCpuCounter=__rdtsc(); + // RDTSC指锟斤拷:锟斤拷取CPU锟斤拷锟斤拷锟斤拷时锟斤拷锟斤拷锟斤拷锟斤拷 + _i64StartCpuCounter = __rdtsc(); - //延时长一点,误差会小一点 - //int nTemp=100000; - //while (--nTemp); - Sleep(200); + // 锟斤拷时锟斤拷一锟斤拷,锟斤拷锟斤拷小一锟斤拷 + // int nTemp=100000; + // while (--nTemp); + Sleep(200); - QueryPerformanceCounter(&lPerformanceCount_End); + QueryPerformanceCounter(&lPerformanceCount_End); - _i64EndCpuCounter=__rdtsc(); + _i64EndCpuCounter = __rdtsc(); - //f=1/T => f=计数次数/(计数次数*T) - //这里的“计数次数*T”就是时间差 - fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) - /(double)lFrequency.QuadPart; + // f=1/T => f=锟斤拷锟斤拷锟斤拷锟斤拷/(锟斤拷锟斤拷锟斤拷锟斤拷*T) + // 锟斤拷锟斤拷摹锟斤拷锟斤拷锟斤拷锟斤拷锟1锟7*T锟斤拷锟斤拷锟斤拷时锟斤拷锟1锟7 + fTime = ((double)lPerformanceCount_End.QuadPart - + (double)lPerformanceCount_Start.QuadPart) / + (double)lFrequency.QuadPart; - mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); - if (verbose>0) - printf("CPU频率为:%1.6fMHz.\n",mhz); - return mhz; + mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0); + if (verbose > 0) + printf("CPU频锟斤拷为:%1.6fMHz.\n", mhz); + return mhz; } -double CPU_Factor1(void) -{ - double result; - int i,j,k,ii,jj,kk; - LARGE_INTEGER lStart,lEnd; +double CPU_Factor1(void) { + double result; + int i, j, k, ii, jj, kk; + LARGE_INTEGER lStart, lEnd; LARGE_INTEGER lFrequency; HANDLE hThread; double fTime; QueryPerformanceFrequency(&lFrequency); - ii = 43273; - kk = 1238; - result = 1; - jj = 1244; + ii = 43273; + kk = 1238; + result = 1; + jj = 1244; - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); + hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); QueryPerformanceCounter(&lStart); //_asm("cpuid"); - start_counter(); - for (i=0;i<100;i++) - for (j=0;j<1000;j++) - for (k=0;k<1000;k++) - kk += kk*ii+jj; + start_counter(); + for (i = 0; i < 100; i++) + for (j = 0; j < 1000; j++) + for (k = 0; k < 1000; k++) + kk += kk * ii + jj; - result = get_counter(); - QueryPerformanceCounter(&lEnd); - fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); - printf("CPU运行时间为%f",result); - printf("\t %f\n",fTime); - return result; + result = get_counter(); + QueryPerformanceCounter(&lEnd); + fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart); + printf("CPU锟斤拷锟斤拷时锟斤拷为%f", result); + printf("\t %f\n", fTime); + return result; } -double CPU_Factor(void) -{ - double frequency; - double multiplier = 1000 * 1000 * 1000;//nano - LARGE_INTEGER lFrequency; - LARGE_INTEGER start,stop; - HANDLE hThread; - int i; - const int gigahertz= 1000*1000*1000; - const int known_instructions_per_loop = 27317; +double CPU_Factor(void) { + double frequency; + double multiplier = 1000 * 1000 * 1000; // nano + LARGE_INTEGER lFrequency; + LARGE_INTEGER start, stop; + HANDLE hThread; + int i; + const int gigahertz = 1000 * 1000 * 1000; + const int known_instructions_per_loop = 27317; - int iterations = 100000000; - int g = 0; - double normal_ticks_per_second; -double ticks; -double time; -double loops_per_sec; -double instructions_per_loop; -double ratio; -double actual_freq; + int iterations = 100000000; + int g = 0; + double normal_ticks_per_second; + double ticks; + double time; + double loops_per_sec; + double instructions_per_loop; + double ratio; + double actual_freq; - QueryPerformanceFrequency(&lFrequency); - frequency = (double)lFrequency.QuadPart; + QueryPerformanceFrequency(&lFrequency); + frequency = (double)lFrequency.QuadPart; - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - QueryPerformanceCounter(&start); - for( i = 0; i < iterations; i++) - { - g++; - g++; - g++; - g++; - } - QueryPerformanceCounter(&stop); + hThread = GetCurrentThread(); + SetThreadAffinityMask(hThread, 0x1); + QueryPerformanceCounter(&start); + for (i = 0; i < iterations; i++) { + g++; + g++; + g++; + g++; + } + QueryPerformanceCounter(&stop); - //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 - normal_ticks_per_second = frequency * 1000; - ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); - time = (ticks * multiplier) /frequency; - loops_per_sec = iterations / (time/multiplier); - instructions_per_loop = normal_ticks_per_second / loops_per_sec; + // normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ + // 3199 + normal_ticks_per_second = frequency * 1000; + ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); + time = (ticks * multiplier) / frequency; + loops_per_sec = iterations / (time / multiplier); + instructions_per_loop = normal_ticks_per_second / loops_per_sec; - ratio = (instructions_per_loop / known_instructions_per_loop); - actual_freq = normal_ticks_per_second / ratio; -/* - actual_freq = normal_ticks_per_second / ratio; - actual_freq = known_instructions_per_loop*iterations*multiplier/time; + ratio = (instructions_per_loop / known_instructions_per_loop); + actual_freq = normal_ticks_per_second / ratio; + /* + actual_freq = normal_ticks_per_second / ratio; + actual_freq = known_instructions_per_loop*iterations*multiplier/time; - 2293 = x/time; - - 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 - loops_per_sec = iterations*frequency / ticks - - instructions_per_loop = / loops_per_sec; -*/ - printf("Perf counter freq: %f\n", normal_ticks_per_second); - printf("Loops per sec: %f\n", loops_per_sec); - printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); - printf("Presumed freq: %f\n", actual_freq); - printf("ratio: %f\n", ratio); - printf("time=%f\n",time); - return ratio; + 2293 = x/time; + + 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 + loops_per_sec = iterations*frequency / ticks + + instructions_per_loop = / loops_per_sec; + */ + printf("Perf counter freq: %f\n", normal_ticks_per_second); + printf("Loops per sec: %f\n", loops_per_sec); + printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); + printf("Presumed freq: %f\n", actual_freq); + printf("ratio: %f\n", ratio); + printf("time=%f\n", time); + return ratio; } diff --git a/perflab/poly/poly.cu b/perflab/poly/poly.cu new file mode 100644 index 0000000..73347fe --- /dev/null +++ b/perflab/poly/poly.cu @@ -0,0 +1,325 @@ +/************************************************************************** + 澶氶」寮忚绠楀嚱鏁般傛寜涓嬮潰鐨勮姹傜紪杈戞鏂囦欢锛 + 1. 灏嗕綘鐨勫鍙枫佸鍚嶏紝浠ユ敞閲婄殑鏂瑰紡鍐欏埌涓嬮潰锛 + 2. 瀹炵幇涓嶅悓鐗堟湰鐨勫椤瑰紡璁$畻鍑芥暟锛 + 3. 缂栬緫peval_fun_rec peval_fun_tab鏁扮粍锛屽皢浣犵殑鏈濂界殑绛旀 + 锛堟渶灏廋PE銆佹渶灏廋10锛変綔涓烘暟缁勭殑鍓嶄袱椤 +***************************************************************************/ + +/* + 瀛﹀彿锛201209054233 + 濮撳悕锛氬鍗婂姞鐝媯 +*/ + + + +#include +#include +#include +typedef int (*peval_fun)(int*, int, int); + +typedef struct { + peval_fun f; + char *descr; +} peval_fun_rec, *peval_fun_ptr; + + +/************************************************************************** + Edit this comment to indicate your name and Andrew ID +#ifdef ASSIGN + Submission by Harry Q. Bovik, bovik@andrew.cmu.edu +#else + Instructor's version. + Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02 +#endif +***************************************************************************/ + +/* + 瀹炵幇涓涓寚瀹氱殑甯哥郴鏁板椤瑰紡璁$畻 + 绗竴娆★紝璇风洿鎺ヨ繍琛岀▼搴忥紝浠ヤ究鑾风煡浣犻渶瑕佸疄鐜扮殑甯哥郴鏁版槸鍟 +*/ +int const_poly_eval(int *not_use, int not_use2, int x) +{ + int result = 0; +/* int i; + int xpwr = 1; // x鐨勫箓娆 + int a[4] = {21,90,42,88}; + for (i = 0; i <= 3; i++) { + result += a[i]*xpwr; + xpwr *= x; + } +*/ +// 90 = 64 + 32 - 4 - 2 +// 42 = 32 + 8 + 2 +// 88 = 64 + 16 + 8 + int x64,x32,x16,x8,x4,x2; + + x64 = x << 6; + x32 = x << 5; + x16 = x << 4; + x8 = x << 3; + x4 = x << 2; + x2 = x << 1; + result = 21 + x64+x32-x4-x2 + ((x32+x8+x2) + (x64+x16+x8)*x)*x; + return result; +} + + + +/* 澶氶」寮忚绠楀嚱鏁般傛敞鎰忥細杩欎釜鍙槸涓涓弬鑰冨疄鐜帮紝浣犻渶瑕佸疄鐜拌嚜宸辩殑鐗堟湰 */ + +/* + 鍙嬫儏鎻愮ず锛歭cc鏀寔ATT鏍煎紡鐨勫祵鍏ュ紡姹囩紪锛屼緥濡 + + _asm("movl %eax,%ebx"); + _asm("pushl %edx"); + + 鍙互鍦╨cc涓璸roject->configuration->Compiler->Code Generation->Generate .asm锛 + 灏嗗叾閫変腑鍚庯紝鍙互鍦╨cc鐩綍涓嬮潰鐢熸垚瀵瑰簲绋嬪簭鐨勬眹缂栦唬鐮佸疄鐜般傞氳繃鏌ョ湅姹囩紪鏂囦欢锛 + 浣犲彲浠ヤ簡瑙g紪璇戝櫒鏄浣曞疄鐜颁綘鐨勪唬鐮佺殑銆傛湁浜涘疄鐜板彲鑳介潪甯镐綆鏁堛 + 浣犲彲浠ュ湪閫傚綋鐨勫湴鏂瑰姞鍏ュ祵鍏ュ紡姹囩紪锛屾潵澶у箙搴︽彁楂樿绠楁ц兘銆 +*/ + +int poly_eval(int *a, int degree, int x) +{ + int result = 0; + int i; + int xpwr = 1; /* x鐨勫箓娆 */ +// printf("闃=%d\n",degree); + for (i = 0; i <= degree; i++) { + result += a[i]*xpwr; + xpwr *= x; + } + return result; +} + +/* CUDA浼樺寲鐨勫椤瑰紡璁$畻鍑芥暟 - 浣嶤PE鐗堟湰 */ +int cuda_poly_eval_low_cpe(int *a, int degree, int x) +{ + // 瀵逛簬浣嶤PE鐗堟湰锛屾垜浠娇鐢–UDA骞惰璁$畻澶氶」寮忕殑鍚勪釜椤 + // 鐒跺悗灏嗙粨鏋滀紶鍥炰富鏈鸿繘琛屾眰鍜 + + // 鍒嗛厤璁惧鍐呭瓨 + int *d_a, *d_results; + cudaError_t err; + + // 鍒嗛厤鍐呭瓨 + err = cudaMalloc(&d_a, (degree + 1) * sizeof(int)); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return 0; + } + + err = cudaMalloc(&d_results, (degree + 1) * sizeof(int)); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + return 0; + } + + // 灏嗙郴鏁颁粠涓绘満澶嶅埗鍒拌澶 + err = cudaMemcpy(d_a, a, (degree + 1) * sizeof(int), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + cudaFree(d_results); + return 0; + } + + // 瀹氫箟CUDA鏍稿嚱鏁 + dim3 blockDim(256); + dim3 gridDim((degree + 1 + blockDim.x - 1) / blockDim.x); + + // 鍚姩鏍稿嚱鏁 + cudaPolyEvalLowCPE<<>>(d_a, degree, x, d_results); + + // 妫鏌ユ牳鍑芥暟鎵ц閿欒 + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + cudaFree(d_results); + return 0; + } + + // 鍒嗛厤涓绘満鍐呭瓨鐢ㄤ簬缁撴灉 + int *h_results = (int *)malloc((degree + 1) * sizeof(int)); + if (h_results == NULL) { + printf("Memory allocation error\n"); + cudaFree(d_a); + cudaFree(d_results); + return 0; + } + + // 灏嗙粨鏋滀粠璁惧澶嶅埗鍥炰富鏈 + err = cudaMemcpy(h_results, d_results, (degree + 1) * sizeof(int), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + free(h_results); + cudaFree(d_a); + cudaFree(d_results); + return 0; + } + + // 鍦ㄤ富鏈轰笂姹傚拰 + int result = 0; + for (int i = 0; i <= degree; i++) { + result += h_results[i]; + } + + // 閲婃斁鍐呭瓨 + free(h_results); + cudaFree(d_a); + cudaFree(d_results); + + return result; +} + +/* CUDA浼樺寲鐨勫椤瑰紡璁$畻鍑芥暟 - 10闃朵紭鍖栫増鏈 */ +int cuda_poly_eval_degree10(int *a, int degree, int x) +{ + // 瀵逛簬10闃跺椤瑰紡锛屾垜浠彲浠ヤ娇鐢ㄦ洿浼樺寲鐨勬柟娉 + // 浣跨敤CUDA骞惰璁$畻锛屼絾閽堝10闃跺椤瑰紡杩涜鐗规畩浼樺寲 + + // 鍒嗛厤璁惧鍐呭瓨 + int *d_a, *d_result; + cudaError_t err; + + // 鍒嗛厤鍐呭瓨 + err = cudaMalloc(&d_a, (degree + 1) * sizeof(int)); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + return 0; + } + + err = cudaMalloc(&d_result, sizeof(int)); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + return 0; + } + + // 灏嗙郴鏁颁粠涓绘満澶嶅埗鍒拌澶 + err = cudaMemcpy(d_a, a, (degree + 1) * sizeof(int), cudaMemcpyHostToDevice); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + cudaFree(d_result); + return 0; + } + + // 瀹氫箟CUDA鏍稿嚱鏁 + dim3 blockDim(256); + dim3 gridDim(1); // 鍙渶瑕佷竴涓潡锛屽洜涓烘垜浠彧闇瑕佷竴涓粨鏋 + + // 鍚姩鏍稿嚱鏁 + cudaPolyEvalDegree10<<>>(d_a, degree, x, d_result); + + // 妫鏌ユ牳鍑芥暟鎵ц閿欒 + err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + cudaFree(d_result); + return 0; + } + + // 鑾峰彇缁撴灉 + int result; + err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + printf("CUDA Error: %s\n", cudaGetErrorString(err)); + cudaFree(d_a); + cudaFree(d_result); + return 0; + } + + // 閲婃斁鍐呭瓨 + cudaFree(d_a); + cudaFree(d_result); + + return result; +} + +/* CUDA鏍稿嚱鏁 - 浣嶤PE鐗堟湰 */ +__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx <= degree) { + // 璁$畻x鐨勫箓 + int xpwr = 1; + for (int i = 0; i < idx; i++) { + xpwr *= x; + } + + // 璁$畻杩欎竴椤圭殑缁撴灉 + results[idx] = a[idx] * xpwr; + } +} + +/* CUDA鏍稿嚱鏁 - 10闃朵紭鍖栫増鏈 */ +__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result) +{ + // 浣跨敤鍏变韩鍐呭瓨鏉ュ瓨鍌ㄤ腑闂寸粨鏋 + __shared__ int shared_result; + + // 鍙湁绗竴涓嚎绋嬪垵濮嬪寲鍏变韩缁撴灉 + if (threadIdx.x == 0) { + shared_result = 0; + } + __syncthreads(); + + // 姣忎釜绾跨▼璁$畻涓閮ㄥ垎椤 + int local_result = 0; + int xpwr = 1; + + // 璁$畻x鐨勫箓 + for (int i = 0; i < threadIdx.x; i++) { + xpwr *= x; + } + + // 璁$畻杩欎竴椤圭殑缁撴灉 + if (threadIdx.x <= degree) { + local_result = a[threadIdx.x] * xpwr; + } + + // 浣跨敤鍘熷瓙鎿嶄綔绱姞缁撴灉 + atomicAdd(&shared_result, local_result); + + // 鍚屾鎵鏈夌嚎绋 + __syncthreads(); + + // 鍙湁绗竴涓嚎绋嬪皢缁撴灉鍐欏洖鍏ㄥ眬鍐呭瓨 + if (threadIdx.x == 0) { + *result = shared_result; + } +} + +/* + 杩欎釜琛ㄦ牸鍖呭惈澶氫釜鏁扮粍鍏冪礌锛屾瘡涓缁勫厓绱狅紙鍑芥暟鍚嶅瓧, "鎻忚堪瀛楃涓"锛 + 灏嗕綘璁や负鏈濂界殑涓や釜瀹炵幇锛屾斁鍦ㄦ渶鍓嶉潰銆 + 姣斿锛 + {my_poly_eval1, "瓒呯骇鍨冨溇瀹炵幇"}, + {my_poly_eval2, "濂戒竴鐐圭殑瀹炵幇"}, +*/ + +peval_fun_rec peval_fun_tab[] = +{ + + /* 绗竴椤癸紝搴斿綋鏄綘鍐欑殑鏈濂紺PE鐨勫嚱鏁板疄鐜 */ + {cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"}, + /* 绗簩椤癸紝搴斿綋鏄綘鍐欑殑鍦10闃舵椂鍏锋湁鏈濂芥ц兘鐨勫疄鐜 */ + {cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"}, + + {poly_eval, "poly_eval: 鍙傝冨疄鐜"}, + + /* 涓嬮潰鐨勪唬鐮佷笉鑳戒慨鏀规垨鑰呭垹闄わ紒锛佽〃鏄庢暟缁勫垪琛ㄧ粨鏉 */ + {NULL, ""} +}; + + + + + + + diff --git a/perflab/poly/poly.o b/perflab/poly/poly.o new file mode 100644 index 0000000..1b650e0 Binary files /dev/null and b/perflab/poly/poly.o differ diff --git a/perflab/poly/poly_test.c b/perflab/poly/poly_test.c index 8c68435..c2e8701 100644 --- a/perflab/poly/poly_test.c +++ b/perflab/poly/poly_test.c @@ -6,6 +6,7 @@ #include "poly.h" #include "cpe.h" #include "clock.h" +#include double CPU_Mhz; @@ -17,7 +18,7 @@ static int coeff[MAXDEGREE+1]; #define MAX_ITER_COUNT 100 -#define REF_CPU_MHZ 2292.6 // 这是我的处理器主频 +#define REF_CPU_MHZ 2292.6 // 锟斤拷锟斤拷锟揭的达拷锟斤拷锟斤拷锟斤拷频 /* Define performance standards */ static struct { @@ -26,7 +27,7 @@ static struct { } cstandard[3] = {{4.00, 1.75}, /* CPE */ {50, 43}, /* C(10) */ - {57,31} /* 常系数多项式计算 */ + {57,31} /* 锟斤拷系锟斤拷锟斤拷锟斤拷式锟斤拷锟斤拷 */ }; int coeff_const[4]; @@ -82,7 +83,7 @@ static void init_const_poly(void) coeff_const[i] = rand_div+10; } - printf("你需要修改poly.c的const_poly_eval函数,实现下面的常数多项式计算!\n"); + printf("锟斤拷锟斤拷要锟睫革拷poly.c锟斤拷const_poly_eval锟斤拷锟斤拷锟斤拷实锟斤拷锟斤拷锟斤拷某锟斤拷锟斤拷锟斤拷锟绞斤拷锟斤拷悖n"); printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]); fixval_const = ref_poly_eval(coeff_const, 3, xval); @@ -97,15 +98,15 @@ void test_const_poly(void) int my_cal = const_poly_eval(coeff_const, 3, xval); if (fixval_const != my_cal) { - printf("常系数多项式计算const_poly_eval实现错误(x=%d),预期结果是%d,但是计算得到的是%d\n",xval,fixval_const,my_cal); + printf("锟斤拷系锟斤拷锟斤拷锟斤拷式锟斤拷锟斤拷const_poly_eval实锟街达拷锟斤拷x=%d锟斤拷锟斤拷预锟节斤拷锟斤拷锟%d锟斤拷锟斤拷锟角硷拷锟斤拷玫锟斤拷锟斤拷锟%d\n",xval,fixval_const,my_cal); exit(0); } fix_time = 0; for (i=0;i