diff --git a/branchPrediction.c b/branchPrediction.c new file mode 100755 index 0000000..83fc273 --- /dev/null +++ b/branchPrediction.c @@ -0,0 +1,653 @@ +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x0) return x - 1; + return x; +} + +#define BITS_OF_PC 13 // 选择13位的PC作为索引 + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32 *State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << BITS_OF_PC); // 状态数组项数 + + State = (UINT32 *)malloc(StateArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + for(UINT32 i = 0; i < StateArraySize; i++) + { + State[i] = 2; + } + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低13位,去索引状态数组State,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + UINT32 index = PC>>2 & 0x1fff; + if(State[index] == 0 || State[index] == 1) return NOT_TAKEN; + return TAKEN; + //return TAKEN; + //return NOT_TAKEN; + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + UINT32 index = PC>>2 & 0x1fff; + if(resolveDir == 'T') + { + State[index] = SatIncrement(State[index], 3); + } + else + { + State[index] = SatDecrement(State[index]); + } + + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); +} + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x0) return x - 1; + return x; +} + +#define BITS_OF_PC 13 // 选择13位的PC作为索引 + +#define STATE_MAX 7 +#define STATE_INIT 3 + +UINT32 *State; // 状态数组,用于保存分支指令的状态机,实际只使用最低3位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << BITS_OF_PC); // 状态数组项数 + + State = (UINT32 *)malloc(StateArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + for(UINT64 i = 0; i <= StateArraySize; i++) + { + State[i] = STATE_INIT; + } + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低13位,去索引状态数组State,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + UINT64 index = (PC>>2) & 0x1fff; + if(State[index] == 0 || State[index] == 1 || State[index] == 2 || State[index] == 3) return NOT_TAKEN; + return TAKEN; + //return TAKEN; + //return NOT_TAKEN; + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + UINT64 index = (PC>>2) & 0x1fff; + if(resolveDir == 'T') + { + State[index] = SatIncrement(State[index], 7); + } + else + { + State[index] = SatDecrement(State[index]); + } + + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); +} + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x < max) return x + 1; + return x; +} + +// 饱和计数器:减1 +static inline UINT32 SatDecrement(UINT32 x) +{ + if (x > 0) return x - 1; + return x; +} + +#define BITS_OF_PC 10 // 选择10位的PC作为索引 +#define LOCAL_HIST_LEN 3 // 局部历史长度,3位 +#define LOCAL_HIST_MASK ~(~0 << LOCAL_HIST_LEN) + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32* pht; // pattern history table 模式历史表 +UINT32 phtArraySize; // pht数组项数 +UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << (BITS_OF_PC + LOCAL_HIST_LEN)); // 状态数组项数 + + State = (UINT32*)malloc(StateArraySize * sizeof(UINT32)); + + phtArraySize = (1 << BITS_OF_PC); // pht数组项数 + + pht = (UINT32*)malloc(phtArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + // 将模式历史表(pht)全部初始化为0 + for(UINT64 i = 0; i < StateArraySize; i++) + { + State[i] = STATE_INIT; + } + for(UINT32 i = 0; i < phtArraySize; i++) + { + pht[i] = 0; + } + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低10位,去索引模式历史表pht,得到对应的3位历史信息 + UINT32 index1 = (PC>>2) & 0x3ff; + // 将PC的低10位,与3位历史信息进行拼接,形成一个13位的状态数组索引(拼接需要使用C语言的移位、与、或等运算) + //UINT64 index2 = (index1<<3) | (pht[index1]); + //UINT64 index2 = (index1) | (pht[index1]<<10); + UINT64 index2 = (index1 & 0x3ff) | (pht[index1]<<10 & 0x1c00); + // 用13位去索引状态数组,得到对应的饱和状态 + if(State[index2] == 0 || State[index2] == 1) return NOT_TAKEN; + return TAKEN; + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + + //return TAKEN; + // return NOT_TAKEN; + + // *********** 你需要在上面书写代码 *********** + +} + +// 2位状态的分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + // 更新pht中的最近3次分支历史信息,使用移位寄存器来更新 + // 将其更新到pht中 + UINT32 index1 = (PC>>2) & 0x3ff; + //UINT64 index2 = (index1<<3) | (pht[index1]); + //UINT64 index2 = (index1) | (pht[index1]<<10); + UINT64 index2 = (index1 & 0x3ff) | (pht[index1]<<10 & 0x1c00); + if(resolveDir == 'T') + { + State[index2] = SatIncrement(State[index2], 3); + pht[index1] = (pht[index1]<<1) | 0x1; + } + else + { + State[index2] = SatDecrement(State[index2]); + pht[index1] = (pht[index1]<<1); + } + + + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); + free(pht); +} + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x < max) return x + 1; + return x; +} + +// 饱和计数器:减1 +static inline UINT32 SatDecrement(UINT32 x) +{ + if (x > 0) return x - 1; + return x; +} + +#define BITS_OF_PC 9 // 选择9位的PC作为索引 +#define LOCAL_HIST_LEN 4 // 局部历史长度,4位 +#define LOCAL_HIST_MASK ~(~0 << LOCAL_HIST_LEN) + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32* pht; // pattern history table 模式历史表 +UINT32 phtArraySize; // pht数组项数 +UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << (BITS_OF_PC + LOCAL_HIST_LEN)); // 状态数组项数 + + State = (UINT32*)malloc(StateArraySize * sizeof(UINT32)); + + phtArraySize = (1 << BITS_OF_PC); // pht数组项数 + + pht = (UINT32*)malloc(phtArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + // 将模式历史表(pht)全部初始化为0 + for(UINT64 i = 0; i < StateArraySize; i++) + { + State[i] = STATE_INIT; + } + for(UINT32 i = 0; i < phtArraySize; i++) + { + pht[i] = 0; + } + // *********** 你需要在上面书写代码 *********** +} + +// 2位状态的分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低10位,去索引模式历史表pht,得到对应的3位历史信息 + // 将PC的低10位,与3位历史信息进行拼接,形成一个13位的状态数组索引(拼接需要使用C语言的移位、与、或等运算) + // 用13位去索引状态数组,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + UINT32 index1 = (PC>>2) & 0x1ff; + UINT64 index2 = (index1 & 0x1ff) | (pht[index1]<<9 & 0x1e00); + if(State[index2] == 0 || State[index2] == 1) return NOT_TAKEN; + return TAKEN; + //return TAKEN; + // return NOT_TAKEN; + + // *********** 你需要在上面书写代码 *********** + +} + +// 2位状态的分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + // 更新pht中的最近3次分支历史信息,使用移位寄存器来更新 + // 将其更新到pht中 + UINT32 index1 = (PC>>2) & 0x1ff; + UINT64 index2 = (index1 & 0x1ff) | (pht[index1]<<9 & 0x1e00); + if(resolveDir == 'T') + { + State[index2] = SatIncrement(State[index2], 3); + pht[index1] = (pht[index1]<<1) | 0x1; + } + else + { + State[index2] = SatDecrement(State[index2]); + pht[index1] = (pht[index1]<<1); + } + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); + free(pht); +} + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x < max) return x + 1; + return x; +} + +// 饱和计数器:减1 +static inline UINT32 SatDecrement(UINT32 x) +{ + if (x > 0) return x - 1; + return x; +} + +#define GLOBAL_HIST_LEN 13 // 全局历史长度,13位 +#define GLOBAL_HIST_MASK ~(~0 << GLOBAL_HIST_LEN) + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32 GHR; // Global History Register,全局历史寄存器 +UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << GLOBAL_HIST_LEN); // 状态数组项数 + + State = (UINT32*)malloc(StateArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + // 将全局历史寄存器(GHR)初始化为0 + for(UINT64 i = 0; i < StateArraySize; i++) + { + State[i] = STATE_INIT; + } + GHR = 0; + // *********** 你需要在上面书写代码 *********** +} + +// Gshare分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 用13位的GHR去索引状态数组,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + UINT64 index = GHR & 0x1fff; + if(State[index] == 0 || State[index] == 1) return NOT_TAKEN; + return TAKEN; + // return TAKEN; + // return NOT_TAKEN; + + // *********** 你需要在上面书写代码 *********** + +} + +// Gshare分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + // 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新 + UINT64 index = GHR & 0x1fff; + if(resolveDir == 'T') + { + State[index] = SatIncrement(State[index], 3); + GHR = GHR << 1 | 0x1; + } + else + { + State[index] = SatDecrement(State[index]); + GHR = GHR << 1; + } + + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); +} + + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x < max) return x + 1; + return x; +} + +// 饱和计数器:减1 +static inline UINT32 SatDecrement(UINT32 x) +{ + if (x > 0) return x - 1; + return x; +} + +#define BITS_OF_PC 3 // 选择3位的PC作为索引 +#define GLOBAL_HIST_LEN 10 // 全局历史长度,10位 +#define STATE_INDEX_MASK ~(~0 << (BITS_OF_PC + GLOBAL_HIST_LEN)) + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32 GHR; // Global History Register,全局历史寄存器 +UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << (BITS_OF_PC +GLOBAL_HIST_LEN)); // 状态数组项数 + + State = (UINT32*)malloc(StateArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + // 将全局历史寄存器(GHR)初始化为0 + for(UINT64 i = 0; i < StateArraySize; i++) + { + State[i] = STATE_INIT; + } + GHR = 0; + // *********** 你需要在上面书写代码 *********** +} + +// Gshare分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低3位,与10位GHR进行拼接,形成一个13位的状态数组索引 + // 用13位去索引状态数组,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转 + UINT64 index = (((PC>>2 & 0x7) << 10) & 0x1c00) | (GHR & 0x3ff); + if(State[index] == 0 || State[index] == 1) return NOT_TAKEN; + return TAKEN; + // return TAKEN; + // return NOT_TAKEN; + + // *********** 你需要在上面书写代码 *********** + +} + +// Gshare分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + // 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新 + UINT64 index = (((PC>>2 & 0x7) << 10) & 0x1c00) | (GHR & 0x3ff); + if(resolveDir == 'T') + { + State[index] = SatIncrement(State[index], 3); + GHR = GHR << 1 | 0x1; + } + else + { + State[index] = SatDecrement(State[index]); + GHR = GHR << 1; + } + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); +} + + + +#include "common.h" + +// 饱和计数器:加1 +static inline UINT32 SatIncrement(UINT32 x, UINT32 max) +{ + if (x < max) return x + 1; + return x; +} + +// 饱和计数器:减1 +static inline UINT32 SatDecrement(UINT32 x) +{ + if (x > 0) return x - 1; + return x; +} + +#define GLOBAL_HIST_LEN 13 // 全局历史长度,13位 +#define GLOBAL_HIST_MASK ~(~0 << GLOBAL_HIST_LEN) + +#define STATE_MAX 3 +#define STATE_INIT 2 + +UINT32 GHR; // Global History Register,全局历史寄存器 +UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位 +UINT64 StateArraySize; + +void PREDICTOR_init(void) +{ + StateArraySize = (1 << GLOBAL_HIST_LEN); // 状态数组项数 + + State = (UINT32*)malloc(StateArraySize * sizeof(UINT32)); + + // *********** 你需要在下面书写代码 *********** + // 将状态数组,全部初始化为STATE_INIT + // 将全局历史寄存器(GHR)初始化为0 + for(UINT64 i = 0; i < StateArraySize; i++) + { + State[i] = STATE_INIT; + } + GHR = 0; + // *********** 你需要在上面书写代码 *********** +} + +// Gshare分支预测器(预测部分) +char GetPrediction(UINT64 PC) +{ + // *********** 你需要在下面书写代码 *********** + // 将PC的低13位,与13位GHR进行异或,形成一个13位的状态数组索引 + // 用13位去索引状态数组,得到对应的饱和状态 + // 如果该状态的值超过一半,则预测跳转 + // 如果该状态的值低于一半,则预测不跳转7] + UINT64 nPC = (PC) & 0x1fff; + UINT64 index = nPC ^ (GHR & 0x1fff); + if(State[index] == 0 || State[index] == 1) return NOT_TAKEN; + return TAKEN; + // return TAKEN; + // return NOT_TAKEN; + + // *********** 你需要在上面书写代码 *********** + +} + +// Gshare分支预测器(更新部分) +void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget) +{ + + // *********** 你需要在下面书写代码 *********** + // 根据分支指令实际执行结果,来更新对应的饱和计数器 + // 如果结果为跳转,则对应的饱和计数器+1 + // 如果结果为不跳转,则对应的饱和计数器-1 + // 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新 + UINT64 nPC = (PC) & 0x1fff; + UINT64 index = nPC ^ (GHR & 0x1fff); + if(resolveDir == 'T') + { + State[index] = SatIncrement(State[index], 3); + GHR = GHR << 1 | 0x1; + } + else + { + State[index] = SatDecrement(State[index]); + GHR = GHR << 1; + } + + // *********** 你需要在上面书写代码 *********** +} + +void PREDICTOR_free(void) +{ + free(State); +} + + diff --git a/perflab/matrix/Makefile b/perflab/matrix/Makefile deleted file mode 100644 index 2dc672d..0000000 --- a/perflab/matrix/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -CC = gcc -CFLAGS = -Wall -O1 -g -#LDFLAGS = -lm -lcudart -lcuda - -# Source files -SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c -#CUDA_SRCS = rowcol.cu -OBJS = $(SRCS:.c=.o) -#rowcol.o - -# Target executable -TARGET = matrix_test - -# Default target -all: $(TARGET) - -# Rule to build the executable -$(TARGET): $(OBJS) - $(CC) $(OBJS) -o $(TARGET) $(LDFLAGS) - -# Rule to build object files -%.o: %.c - $(CC) $(CFLAGS) -c $< -o $@ - -# Rule to build CUDA object files -#rowcol.o: rowcol.cu -# $(NVCC) $(CUDA_FLAGS) -c $< -o $@ - -# Clean rule -clean: - rm -f $(OBJS) $(TARGET) - -# Phony targets -.PHONY: all clean diff --git a/perflab/matrix/a.exe b/perflab/matrix/a.exe new file mode 100755 index 0000000..12640dd Binary files /dev/null and b/perflab/matrix/a.exe differ diff --git a/perflab/matrix/clock.c b/perflab/matrix/clock.c index b826af4..a587590 100644 --- a/perflab/matrix/clock.c +++ b/perflab/matrix/clock.c @@ -1,196 +1,229 @@ -/* clock.c - * Retrofitted to use thread-specific timers - * and to get clock information from /proc/cpuinfo - * (C) R. E. Bryant, 2010 - * Modified for cross-platform compatibility - */ - -#define _GNU_SOURCE // For sched_setaffinity on Linux -#include -#include -#include -#include - -#ifdef _WIN32 -#include -#include -#else -#include -#include -#include -#include -typedef struct { - uint64_t QuadPart; -} LARGE_INTEGER; -typedef void *HANDLE; -#define __int64 long long -#define Sleep(ms) usleep((ms) * 1000) -#endif - -#include "clock.h" - -/* Use x86 cycle counter */ -static unsigned cyc_hi = 0; -static unsigned cyc_lo = 0; - -void access_counter(unsigned *hi, unsigned *lo) { - uint64_t counter = __rdtsc(); - *hi = (unsigned)(counter >> 32); - *lo = (unsigned)counter; -} - -void start_counter() { access_counter(&cyc_hi, &cyc_lo); } - -double get_counter() { - unsigned ncyc_hi, ncyc_lo; - access_counter(&ncyc_hi, &ncyc_lo); - uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo; - uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo; - return (double)(end - start); -} - -void make_CPU_busy(void) { - volatile double old_tick = get_counter(); - volatile double new_tick; - while ((new_tick - old_tick) < 1000000000) { - new_tick = get_counter(); - } -} - -#ifdef _WIN32 -#define GET_TIME(dest) QueryPerformanceCounter(dest) -#else -static inline void GET_TIME(LARGE_INTEGER *dest) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - dest->QuadPart = (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec; -} -#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000) -#endif - -double mhz(int verbose) { - LARGE_INTEGER lFrequency; - LARGE_INTEGER lPerformanceCount_Start; - LARGE_INTEGER lPerformanceCount_End; - double mhz; - double fTime; - __int64 _i64StartCpuCounter; - __int64 _i64EndCpuCounter; - -#ifdef _WIN32 - HANDLE hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); -#else - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - sched_setaffinity(0, sizeof(cpuset), &cpuset); -#endif - - QueryPerformanceFrequency(&lFrequency); - GET_TIME(&lPerformanceCount_Start); - _i64StartCpuCounter = __rdtsc(); - Sleep(200); - GET_TIME(&lPerformanceCount_End); - _i64EndCpuCounter = __rdtsc(); - - fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) / - (double)lFrequency.QuadPart; - mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0); - - if (verbose > 0) { - printf("CPU频率为: %.6fMHz.\n", mhz); - } - return mhz; -} - -double CPU_Factor1(void) { - double result; - int i, j, k; - LARGE_INTEGER lStart, lEnd; - LARGE_INTEGER lFrequency; - double fTime; - -#ifdef _WIN32 - HANDLE hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); -#else - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - sched_setaffinity(0, sizeof(cpuset), &cpuset); -#endif - - QueryPerformanceFrequency(&lFrequency); - GET_TIME(&lStart); - start_counter(); - - for (i = 0; i < 100; i++) - for (j = 0; j < 1000; j++) - for (k = 0; k < 1000; k++) - ; - - result = get_counter(); - GET_TIME(&lEnd); - - fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart; - printf("CPU计算时长为: %f", result); - printf("\t %f\n", fTime); - return result; -} - -double CPU_Factor(void) { - double frequency; - double multiplier = 1000 * 1000 * 1000; // nano - LARGE_INTEGER lFrequency; - LARGE_INTEGER start, stop; - int i; - const int known_instructions_per_loop = 27317; - int iterations = 100000000; - int g = 0; - double normal_ticks_per_second; - double ticks; - double time; - double loops_per_sec; - double instructions_per_loop; - double ratio; - double actual_freq; - -#ifdef _WIN32 - HANDLE hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); -#else - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(0, &cpuset); - sched_setaffinity(0, sizeof(cpuset), &cpuset); -#endif - - QueryPerformanceFrequency(&lFrequency); - frequency = (double)lFrequency.QuadPart; - GET_TIME(&start); - - for (i = 0; i < iterations; i++) { - g++; - g++; - g++; - g++; - } - - GET_TIME(&stop); - normal_ticks_per_second = frequency * 1000; - ticks = (double)(stop.QuadPart - start.QuadPart); - time = (ticks * multiplier) / frequency; - loops_per_sec = iterations / (time / multiplier); - instructions_per_loop = normal_ticks_per_second / loops_per_sec; - ratio = instructions_per_loop / known_instructions_per_loop; - actual_freq = normal_ticks_per_second / ratio; - - printf("Perf counter freq: %f\n", normal_ticks_per_second); - printf("Loops per sec: %f\n", loops_per_sec); - printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); - printf("Presumed freq: %f\n", actual_freq); - printf("ratio: %f\n", ratio); - printf("time=%f\n", time); - return ratio; -} +/* clock.c + * Retrofitted to use thread-specific timers + * and to get clock information from /proc/cpuinfo + * (C) R. E. Bryant, 2010 + * + */ + +/* When this constant is not defined, uses time stamp counter */ +#define USE_POSIX 0 + +/* Choice to use cpu_gettime call or Intel time stamp counter directly */ + +#include +#include +#include +#include +//#include +#include +#include +#include "clock.h" + +/* Use x86 cycle counter */ + +/* Initialize the cycle counter */ +static unsigned cyc_hi = 0; +static unsigned cyc_lo = 0; + +/* Set *hi and *lo to the high and low order bits of the cycle counter. + Implementation requires assembly code to use the rdtsc instruction. */ +void access_counter(unsigned *hi, unsigned *lo) +{ + + long long counter; + + counter = __rdtsc(); + (*hi) = (unsigned int)(counter >> 32); + (*lo) = (unsigned int)counter; +/* + + LARGE_INTEGER lPerformanceCount; + + QueryPerformanceCounter(&lPerformanceCount); + (*hi) = (unsigned int)lPerformanceCount.HighPart; + (*lo) = (unsigned int)lPerformanceCount.LowPart; +// printf("%08X %08X\n",(*hi),(*lo)); +*/ +} + + +/* Record the current value of the cycle counter. */ +void start_counter() +{ + access_counter(&cyc_hi, &cyc_lo); +} + +/* Return the number of cycles since the last call to start_counter. */ +double get_counter() +{ + unsigned ncyc_hi, ncyc_lo; + unsigned hi, lo, borrow; + double result; + + /* Get cycle counter */ + access_counter(&ncyc_hi, &ncyc_lo); + + /* Do double precision subtraction */ + lo = ncyc_lo - cyc_lo; + borrow = cyc_lo > ncyc_lo; + hi = ncyc_hi - cyc_hi - borrow; + result = (double) hi * (1 << 30) * 4 + lo; + return result; +} +void make_CPU_busy(void) +{ + volatile double old_tick,new_tick; + start_counter(); + old_tick = get_counter(); + new_tick = get_counter(); + while (new_tick - old_tick < 1000000000) + new_tick = get_counter(); +} + +//CPUƵ +double mhz(int verbose) +{ + LARGE_INTEGER lFrequency; + LARGE_INTEGER lPerformanceCount_Start; + LARGE_INTEGER lPerformanceCount_End; + double mhz; + double fTime; + __int64 _i64StartCpuCounter; + __int64 _i64EndCpuCounter; + //On a multiprocessor machine, it should not matter which processor is called. + //However, you can get different results on different processors due to bugs in + //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. + HANDLE hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + + //ϸ߾ȶʱľƵ + //ʱӦþһƬ82538254 + //intel ich7м8254 + QueryPerformanceFrequency(&lFrequency); +// if (verbose>0) +// printf("߾ȶʱľƵʣ%1.0fHz.\n",(double)lFrequency.QuadPart); + + //ʱÿһʱڣ+1 + QueryPerformanceCounter(&lPerformanceCount_Start); + + //RDTSCָ:ȡCPUʱ + _i64StartCpuCounter=__rdtsc(); + + //ʱһ,Сһ + //int nTemp=100000; + //while (--nTemp); + Sleep(200); + + QueryPerformanceCounter(&lPerformanceCount_End); + + _i64EndCpuCounter=__rdtsc(); + + //f=1/T => f=/(*T) + //ġ*Tʱ + fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) + /(double)lFrequency.QuadPart; + + mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); + if (verbose>0) + printf("CPUƵΪ:%1.6fMHz.\n",mhz); + return mhz; +} + +double CPU_Factor1(void) +{ + double result; + int i,j,k,ii,jj,kk; + LARGE_INTEGER lStart,lEnd; + LARGE_INTEGER lFrequency; + HANDLE hThread; + double fTime; + + QueryPerformanceFrequency(&lFrequency); + + ii = 43273; + kk = 1238; + result = 1; + jj = 1244; + + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + QueryPerformanceCounter(&lStart); + //_asm("cpuid"); + start_counter(); + for (i=0;i<100;i++) + for (j=0;j<1000;j++) + for (k=0;k<1000;k++) + kk += kk*ii+jj; + + result = get_counter(); + QueryPerformanceCounter(&lEnd); + fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); + printf("CPUʱΪ%f",result); + printf("\t %f\n",fTime); + return result; +} + +double CPU_Factor(void) +{ + double frequency; + double multiplier = 1000 * 1000 * 1000;//nano + LARGE_INTEGER lFrequency; + LARGE_INTEGER start,stop; + HANDLE hThread; + int i; + const int gigahertz= 1000*1000*1000; + const int known_instructions_per_loop = 27317; + + int iterations = 100000000; + int g = 0; + double normal_ticks_per_second; +double ticks; +double time; +double loops_per_sec; +double instructions_per_loop; +double ratio; +double actual_freq; + + QueryPerformanceFrequency(&lFrequency); + frequency = (double)lFrequency.QuadPart; + + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + QueryPerformanceCounter(&start); + for( i = 0; i < iterations; i++) + { + g++; + g++; + g++; + g++; + } + QueryPerformanceCounter(&stop); + + //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 + normal_ticks_per_second = frequency * 1000; + ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); + time = (ticks * multiplier) /frequency; + loops_per_sec = iterations / (time/multiplier); + instructions_per_loop = normal_ticks_per_second / loops_per_sec; + + ratio = (instructions_per_loop / known_instructions_per_loop); + actual_freq = normal_ticks_per_second / ratio; +/* + actual_freq = normal_ticks_per_second / ratio; + actual_freq = known_instructions_per_loop*iterations*multiplier/time; + + 2293 = x/time; + + 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 + loops_per_sec = iterations*frequency / ticks + + instructions_per_loop = / loops_per_sec; +*/ + printf("Perf counter freq: %f\n", normal_ticks_per_second); + printf("Loops per sec: %f\n", loops_per_sec); + printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); + printf("Presumed freq: %f\n", actual_freq); + printf("ratio: %f\n", ratio); + printf("time=%f\n",time); + return ratio; +} diff --git a/perflab/matrix/clock.c.bak b/perflab/matrix/clock.c.bak deleted file mode 100644 index 3b2a198..0000000 --- a/perflab/matrix/clock.c.bak +++ /dev/null @@ -1,229 +0,0 @@ -/* clock.c - * Retrofitted to use thread-specific timers - * and to get clock information from /proc/cpuinfo - * (C) R. E. Bryant, 2010 - * - */ - -/* When this constant is not defined, uses time stamp counter */ -#define USE_POSIX 0 - -/* Choice to use cpu_gettime call or Intel time stamp counter directly */ - -#include -#include -#include -#include -//#include -//#include -#include -#include "clock.h" - -/* Use x86 cycle counter */ - -/* Initialize the cycle counter */ -static unsigned cyc_hi = 0; -static unsigned cyc_lo = 0; - -/* Set *hi and *lo to the high and low order bits of the cycle counter. - Implementation requires assembly code to use the rdtsc instruction. */ -void access_counter(unsigned *hi, unsigned *lo) -{ - - long long counter; - - counter = __rdtsc(); - (*hi) = (unsigned int)(counter >> 32); - (*lo) = (unsigned int)counter; -/* - - LARGE_INTEGER lPerformanceCount; - - QueryPerformanceCounter(&lPerformanceCount); - (*hi) = (unsigned int)lPerformanceCount.HighPart; - (*lo) = (unsigned int)lPerformanceCount.LowPart; -// printf("%08X %08X\n",(*hi),(*lo)); -*/ -} - - -/* Record the current value of the cycle counter. */ -void start_counter() -{ - access_counter(&cyc_hi, &cyc_lo); -} - -/* Return the number of cycles since the last call to start_counter. */ -double get_counter() -{ - unsigned ncyc_hi, ncyc_lo; - unsigned hi, lo, borrow; - double result; - - /* Get cycle counter */ - access_counter(&ncyc_hi, &ncyc_lo); - - /* Do double precision subtraction */ - lo = ncyc_lo - cyc_lo; - borrow = cyc_lo > ncyc_lo; - hi = ncyc_hi - cyc_hi - borrow; - result = (double) hi * (1 << 30) * 4 + lo; - return result; -} -void make_CPU_busy(void) -{ - volatile double old_tick,new_tick; - start_counter(); - old_tick = get_counter(); - new_tick = get_counter(); - while (new_tick - old_tick < 1000000000) - new_tick = get_counter(); -} - -//CPUƵ -double mhz(int verbose) -{ - LARGE_INTEGER lFrequency; - LARGE_INTEGER lPerformanceCount_Start; - LARGE_INTEGER lPerformanceCount_End; - double mhz; - double fTime; - __int64 _i64StartCpuCounter; - __int64 _i64EndCpuCounter; - //On a multiprocessor machine, it should not matter which processor is called. - //However, you can get different results on different processors due to bugs in - //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. - HANDLE hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - - //ϸ߾ȶʱľƵ - //ʱӦþһƬ82538254 - //intel ich7м8254 - QueryPerformanceFrequency(&lFrequency); -// if (verbose>0) -// printf("߾ȶʱľƵʣ%1.0fHz.\n",(double)lFrequency.QuadPart); - - //ʱÿһʱڣ+1 - QueryPerformanceCounter(&lPerformanceCount_Start); - - //RDTSCָ:ȡCPUʱ - _i64StartCpuCounter=__rdtsc(); - - //ʱһ,Сһ - //int nTemp=100000; - //while (--nTemp); - Sleep(200); - - QueryPerformanceCounter(&lPerformanceCount_End); - - _i64EndCpuCounter=__rdtsc(); - - //f=1/T => f=/(*T) - //ġ*Tʱ - fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) - /(double)lFrequency.QuadPart; - - mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); - if (verbose>0) - printf("CPUƵΪ:%1.6fMHz.\n",mhz); - return mhz; -} - -double CPU_Factor1(void) -{ - double result; - int i,j,k,ii,jj,kk; - LARGE_INTEGER lStart,lEnd; - LARGE_INTEGER lFrequency; - HANDLE hThread; - double fTime; - - QueryPerformanceFrequency(&lFrequency); - - ii = 43273; - kk = 1238; - result = 1; - jj = 1244; - - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - QueryPerformanceCounter(&lStart); - //_asm("cpuid"); - start_counter(); - for (i=0;i<100;i++) - for (j=0;j<1000;j++) - for (k=0;k<1000;k++) - kk += kk*ii+jj; - - result = get_counter(); - QueryPerformanceCounter(&lEnd); - fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); - printf("CPUʱΪ%f",result); - printf("\t %f\n",fTime); - return result; -} - -double CPU_Factor(void) -{ - double frequency; - double multiplier = 1000 * 1000 * 1000;//nano - LARGE_INTEGER lFrequency; - LARGE_INTEGER start,stop; - HANDLE hThread; - int i; - const int gigahertz= 1000*1000*1000; - const int known_instructions_per_loop = 27317; - - int iterations = 100000000; - int g = 0; - double normal_ticks_per_second; -double ticks; -double time; -double loops_per_sec; -double instructions_per_loop; -double ratio; -double actual_freq; - - QueryPerformanceFrequency(&lFrequency); - frequency = (double)lFrequency.QuadPart; - - hThread=GetCurrentThread(); - SetThreadAffinityMask(hThread,0x1); - QueryPerformanceCounter(&start); - for( i = 0; i < iterations; i++) - { - g++; - g++; - g++; - g++; - } - QueryPerformanceCounter(&stop); - - //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 - normal_ticks_per_second = frequency * 1000; - ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); - time = (ticks * multiplier) /frequency; - loops_per_sec = iterations / (time/multiplier); - instructions_per_loop = normal_ticks_per_second / loops_per_sec; - - ratio = (instructions_per_loop / known_instructions_per_loop); - actual_freq = normal_ticks_per_second / ratio; -/* - actual_freq = normal_ticks_per_second / ratio; - actual_freq = known_instructions_per_loop*iterations*multiplier/time; - - 2293 = x/time; - - 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 - loops_per_sec = iterations*frequency / ticks - - instructions_per_loop = / loops_per_sec; -*/ - printf("Perf counter freq: %f\n", normal_ticks_per_second); - printf("Loops per sec: %f\n", loops_per_sec); - printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); - printf("Presumed freq: %f\n", actual_freq); - printf("ratio: %f\n", ratio); - printf("time=%f\n",time); - return ratio; -} diff --git a/perflab/matrix/clock.o b/perflab/matrix/clock.o deleted file mode 100644 index 0e68ee3..0000000 Binary files a/perflab/matrix/clock.o and /dev/null differ diff --git a/perflab/matrix/cpe.o b/perflab/matrix/cpe.o deleted file mode 100644 index 96d717d..0000000 Binary files a/perflab/matrix/cpe.o and /dev/null differ diff --git a/perflab/matrix/fcyc.c b/perflab/matrix/fcyc.c index a5f4077..9246307 100644 --- a/perflab/matrix/fcyc.c +++ b/perflab/matrix/fcyc.c @@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params) if (clear_cache) clear(); start_counter(); - f((long*)params); + f((long int*)params); cyc = get_counter(); if (cyc > 0.0) add_sample(cyc); @@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params) clear(); start_counter(); for (i=0;i 0.0) add_sample(cyc); diff --git a/perflab/matrix/fcyc.o b/perflab/matrix/fcyc.o deleted file mode 100644 index 0943503..0000000 Binary files a/perflab/matrix/fcyc.o and /dev/null differ diff --git a/perflab/matrix/lsquare.o b/perflab/matrix/lsquare.o deleted file mode 100644 index f36c57e..0000000 Binary files a/perflab/matrix/lsquare.o and /dev/null differ diff --git a/perflab/matrix/matrix_test b/perflab/matrix/matrix_test deleted file mode 100644 index 9fbc0a4..0000000 Binary files a/perflab/matrix/matrix_test and /dev/null differ diff --git a/perflab/matrix/rowcol.c b/perflab/matrix/rowcol.c deleted file mode 100644 index b504582..0000000 --- a/perflab/matrix/rowcol.c +++ /dev/null @@ -1,69 +0,0 @@ -/************************************************************************** - ??/??????????????????????????????? - 1. ??????????????????????????????? - 2. ?????????????????????? - 3. ??rc_fun_rec rc_fun_tab?????????????????? - ??????????????????????????????????????????? -***************************************************************************/ - -/* - ????201209054233 - ?????????????? -*/ - -#include "rowcol.h" -#include -#include -#include - -/* ????????????????? */ -/* ??????????????????????????????????????????????? - ??????????2????????????????? -*/ - -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { - int i, j; - for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; - } -} - -/* ???????????????????? */ -/* ??????????????????????? */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) { - int i, j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - -/* - ????????????????????????????????????????, COL/ROWCOL, "?????????"?? - COL?????????????????????? - ROWCOL??????????????????????? - ????????????????????????????? - ???? - {my_c_sum1, "?????????????????"}, - {my_rc_sum2, "??????????????????"}, -*/ - -rc_fun_rec rc_fun_tab[] = { - - /* ???????????????????????????????? */ - {c_sum, COL, "Best column sum"}, - /* ?????????????????????????????????? */ - {rc_sum, ROWCOL, "Best row and column sum"}, - - {c_sum, COL, "Column sum, reference implementation"}, - - {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - - /* ??????????????????????????????????????? */ - {NULL, ROWCOL, NULL}}; \ No newline at end of file diff --git a/perflab/matrix/rowcol.c~ b/perflab/matrix/rowcol.c~ deleted file mode 100644 index 990ce83..0000000 --- a/perflab/matrix/rowcol.c~ +++ /dev/null @@ -1,162 +0,0 @@ -/************************************************************************** - /ͺҪ༭ļ - 1. ѧšע͵ķʽд棻 - 2. ʵֲͬ汾ͺ - 3. ༭rc_fun_rec rc_fun_tab飬õĴ - õк͡õͣΪǰ -***************************************************************************/ - -/* - ѧţ202302723005 - ̾ -*/ - - -#include -#include -#include "rowcol.h" -#include -#include - -/* οͺʵ */ -/* еÿһеĺ͡עк˵ò - һģֻǵ2õ -*/ - -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; - } -} - - -/* οкͺʵ */ -/* еÿһСÿһеĺ͡ */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - -/* CUDAŻͺ */ -void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - // 豸ڴ - int *d_M, *d_colsum; - cudaMalloc(&d_M, N * N * sizeof(int)); - cudaMalloc(&d_colsum, N * sizeof(int)); - - // ݴƵ豸 - cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); - - // CUDA˺ - dim3 blockDim(256); - dim3 gridDim((N + blockDim.x - 1) / blockDim.x); - - // ˺ - cudaColumnSum<<>>(d_M, d_colsum); - - // 豸ƻ - cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); - - // ͷ豸ڴ - cudaFree(d_M); - cudaFree(d_colsum); -} - -/* CUDAŻͺ */ -void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - // 豸ڴ - int *d_M, *d_rowsum, *d_colsum; - cudaMalloc(&d_M, N * N * sizeof(int)); - cudaMalloc(&d_rowsum, N * sizeof(int)); - cudaMalloc(&d_colsum, N * sizeof(int)); - - // ݴƵ豸 - cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); - - // CUDA˺ - dim3 blockDim(256); - dim3 gridDim((N + blockDim.x - 1) / blockDim.x); - - // ˺ - cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); - - // 豸ƻ - cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); - cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); - - // ͷ豸ڴ - cudaFree(d_M); - cudaFree(d_rowsum); - cudaFree(d_colsum); -} - -/* CUDA˺ - */ -__global__ void cudaColumnSum(int *M, int *colsum) -{ - int col = blockIdx.x * blockDim.x + threadIdx.x; - if (col < N) { - colsum[col] = 0; - for (int row = 0; row < N; row++) { - colsum[col] += M[row * N + col]; - } - } -} - -/* CUDA˺ - */ -__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < N) { - // к - rowsum[idx] = 0; - for (int j = 0; j < N; j++) { - rowsum[idx] += M[idx * N + j]; - } - - // к - colsum[idx] = 0; - for (int i = 0; i < N; i++) { - colsum[idx] += M[i * N + idx]; - } - } -} - -/* - ԪأÿһԪأ, COL/ROWCOL, "ַ" - COLʾúÿһеĺ - ROWCOLʾúÿһСÿһеĺ - Ϊõʵ֣ǰ档 - 磺 - {my_c_sum1, "ʵ"}, - {my_rc_sum2, "һʵ"}, -*/ - -rc_fun_rec rc_fun_tab[] = -{ - - /* һӦд͵ĺʵ */ - {cuda_c_sum, COL, "CUDA optimized column sum"}, - /* ڶӦд͵ĺʵ */ - {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, - - {c_sum, COL, "Column sum, reference implementation"}, - - {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - - /* Ĵ벻޸Ļɾб */ - {NULL,ROWCOL,NULL} -}; diff --git a/perflab/matrix/rowcol.o b/perflab/matrix/rowcol.o deleted file mode 100644 index abada5f..0000000 Binary files a/perflab/matrix/rowcol.o and /dev/null differ diff --git a/perflab/matrix/rowcol.y~ b/perflab/matrix/rowcol.y~ deleted file mode 100644 index 5d3310a..0000000 --- a/perflab/matrix/rowcol.y~ +++ /dev/null @@ -1,240 +0,0 @@ -/************************************************************************** - / - 1. - 2. - 3. rc_fun_rec rc_fun_tab - -***************************************************************************/ - -/* - 201209054233 - -*/ - - -#include -#include -#include "rowcol.h" -#include - -/* */ -/* 򿿿 - 2 -*/ - -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; - } -} - - -/* */ -/* */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - - - -/* - , COL/ROWCOL, "" - COL - ROWCOL - - - {my_c_sum1, ""}, - {my_rc_sum2, ""}, -*/ - -rc_fun_rec rc_fun_tab[] = -{ - - /* */ - {c_sum, COL, "Best column sum"}, - /* */ - {rc_sum, ROWCOL, "Best row and column sum"}, - - {c_sum, COL, "Column sum, reference implementation"}, - - {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - - /* */ - {NULL,ROWCOL,NULL} -}; - -// /************************************************************************** -// /ͺҪ༭ļ -// 1. ѧšע͵ķʽд棻 -// 2. ʵֲͬ汾ͺ -// 3. ༭rc_fun_rec rc_fun_tab飬õĴ -// õк͡õͣΪǰ -// ***************************************************************************/ -// -// /* -// ѧţ202302723005 -// ̾ -// */ -// -// -// #include -// #include -// #include "rowcol.h" -// #include -// #include -// -// /* οͺʵ */ -// /* еÿһеĺ͡עк˵ò -// һģֻǵ2õ -// */ -// -// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// int i,j; -// for (j = 0; j < N; j++) { -// colsum[j] = 0; -// for (i = 0; i < N; i++) -// colsum[j] += M[i][j]; -// } -// } -// -// -// /* οкͺʵ */ -// /* еÿһСÿһеĺ͡ */ -// -// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// int i,j; -// for (i = 0; i < N; i++) { -// rowsum[i] = colsum[i] = 0; -// for (j = 0; j < N; j++) { -// rowsum[i] += M[i][j]; -// colsum[i] += M[j][i]; -// } -// } -// } -// -// /* CUDAŻͺ */ -// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// // 豸ڴ -// int *d_M, *d_colsum; -// cudaMalloc(&d_M, N * N * sizeof(int)); -// cudaMalloc(&d_colsum, N * sizeof(int)); -// -// // ݴƵ豸 -// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); -// -// // CUDA˺ -// dim3 blockDim(256); -// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); -// -// // ˺ -// cudaColumnSum<<>>(d_M, d_colsum); -// -// // 豸ƻ -// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// -// // ͷ豸ڴ -// cudaFree(d_M); -// cudaFree(d_colsum); -// } -// -// /* CUDAŻͺ */ -// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// // 豸ڴ -// int *d_M, *d_rowsum, *d_colsum; -// cudaMalloc(&d_M, N * N * sizeof(int)); -// cudaMalloc(&d_rowsum, N * sizeof(int)); -// cudaMalloc(&d_colsum, N * sizeof(int)); -// -// // ݴƵ豸 -// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); -// -// // CUDA˺ -// dim3 blockDim(256); -// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); -// -// // ˺ -// cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); -// -// // 豸ƻ -// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// -// // ͷ豸ڴ -// cudaFree(d_M); -// cudaFree(d_rowsum); -// cudaFree(d_colsum); -// } -// -// /* CUDA˺ - */ -// __global__ void cudaColumnSum(int *M, int *colsum) -// { -// int col = blockIdx.x * blockDim.x + threadIdx.x; -// if (col < N) { -// colsum[col] = 0; -// for (int row = 0; row < N; row++) { -// colsum[col] += M[row * N + col]; -// } -// } -// } -// -// /* CUDA˺ - */ -// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) -// { -// int idx = blockIdx.x * blockDim.x + threadIdx.x; -// if (idx < N) { -// // к -// rowsum[idx] = 0; -// for (int j = 0; j < N; j++) { -// rowsum[idx] += M[idx * N + j]; -// } -// -// // к -// colsum[idx] = 0; -// for (int i = 0; i < N; i++) { -// colsum[idx] += M[i * N + idx]; -// } -// } -// } -// -// /* -// ԪأÿһԪأ, COL/ROWCOL, "ַ" -// COLʾúÿһеĺ -// ROWCOLʾúÿһСÿһеĺ -// Ϊõʵ֣ǰ档 -// 磺 -// {my_c_sum1, "ʵ"}, -// {my_rc_sum2, "һʵ"}, -// */ -// -// rc_fun_rec rc_fun_tab[] = -// { -// -// /* һӦд͵ĺʵ */ -// {cuda_c_sum, COL, "CUDA optimized column sum"}, -// /* ڶӦд͵ĺʵ */ -// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, -// -// {c_sum, COL, "Column sum, reference implementation"}, -// -// {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, -// -// /* Ĵ벻޸Ļɾб */ -// {NULL,ROWCOL,NULL} -// }; diff --git a/perflab/matrix/rowcol.z~ b/perflab/matrix/rowcol.z~ deleted file mode 100644 index 5d3310a..0000000 --- a/perflab/matrix/rowcol.z~ +++ /dev/null @@ -1,240 +0,0 @@ -/************************************************************************** - / - 1. - 2. - 3. rc_fun_rec rc_fun_tab - -***************************************************************************/ - -/* - 201209054233 - -*/ - - -#include -#include -#include "rowcol.h" -#include - -/* */ -/* 򿿿 - 2 -*/ - -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; - } -} - - -/* */ -/* */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -{ - int i,j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - - - -/* - , COL/ROWCOL, "" - COL - ROWCOL - - - {my_c_sum1, ""}, - {my_rc_sum2, ""}, -*/ - -rc_fun_rec rc_fun_tab[] = -{ - - /* */ - {c_sum, COL, "Best column sum"}, - /* */ - {rc_sum, ROWCOL, "Best row and column sum"}, - - {c_sum, COL, "Column sum, reference implementation"}, - - {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - - /* */ - {NULL,ROWCOL,NULL} -}; - -// /************************************************************************** -// /ͺҪ༭ļ -// 1. ѧšע͵ķʽд棻 -// 2. ʵֲͬ汾ͺ -// 3. ༭rc_fun_rec rc_fun_tab飬õĴ -// õк͡õͣΪǰ -// ***************************************************************************/ -// -// /* -// ѧţ202302723005 -// ̾ -// */ -// -// -// #include -// #include -// #include "rowcol.h" -// #include -// #include -// -// /* οͺʵ */ -// /* еÿһеĺ͡עк˵ò -// һģֻǵ2õ -// */ -// -// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// int i,j; -// for (j = 0; j < N; j++) { -// colsum[j] = 0; -// for (i = 0; i < N; i++) -// colsum[j] += M[i][j]; -// } -// } -// -// -// /* οкͺʵ */ -// /* еÿһСÿһеĺ͡ */ -// -// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// int i,j; -// for (i = 0; i < N; i++) { -// rowsum[i] = colsum[i] = 0; -// for (j = 0; j < N; j++) { -// rowsum[i] += M[i][j]; -// colsum[i] += M[j][i]; -// } -// } -// } -// -// /* CUDAŻͺ */ -// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// // 豸ڴ -// int *d_M, *d_colsum; -// cudaMalloc(&d_M, N * N * sizeof(int)); -// cudaMalloc(&d_colsum, N * sizeof(int)); -// -// // ݴƵ豸 -// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); -// -// // CUDA˺ -// dim3 blockDim(256); -// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); -// -// // ˺ -// cudaColumnSum<<>>(d_M, d_colsum); -// -// // 豸ƻ -// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// -// // ͷ豸ڴ -// cudaFree(d_M); -// cudaFree(d_colsum); -// } -// -// /* CUDAŻͺ */ -// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) -// { -// // 豸ڴ -// int *d_M, *d_rowsum, *d_colsum; -// cudaMalloc(&d_M, N * N * sizeof(int)); -// cudaMalloc(&d_rowsum, N * sizeof(int)); -// cudaMalloc(&d_colsum, N * sizeof(int)); -// -// // ݴƵ豸 -// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice); -// -// // CUDA˺ -// dim3 blockDim(256); -// dim3 gridDim((N + blockDim.x - 1) / blockDim.x); -// -// // ˺ -// cudaRowColSum<<>>(d_M, d_rowsum, d_colsum); -// -// // 豸ƻ -// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost); -// -// // ͷ豸ڴ -// cudaFree(d_M); -// cudaFree(d_rowsum); -// cudaFree(d_colsum); -// } -// -// /* CUDA˺ - */ -// __global__ void cudaColumnSum(int *M, int *colsum) -// { -// int col = blockIdx.x * blockDim.x + threadIdx.x; -// if (col < N) { -// colsum[col] = 0; -// for (int row = 0; row < N; row++) { -// colsum[col] += M[row * N + col]; -// } -// } -// } -// -// /* CUDA˺ - */ -// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum) -// { -// int idx = blockIdx.x * blockDim.x + threadIdx.x; -// if (idx < N) { -// // к -// rowsum[idx] = 0; -// for (int j = 0; j < N; j++) { -// rowsum[idx] += M[idx * N + j]; -// } -// -// // к -// colsum[idx] = 0; -// for (int i = 0; i < N; i++) { -// colsum[idx] += M[i * N + idx]; -// } -// } -// } -// -// /* -// ԪأÿһԪأ, COL/ROWCOL, "ַ" -// COLʾúÿһеĺ -// ROWCOLʾúÿһСÿһеĺ -// Ϊõʵ֣ǰ档 -// 磺 -// {my_c_sum1, "ʵ"}, -// {my_rc_sum2, "һʵ"}, -// */ -// -// rc_fun_rec rc_fun_tab[] = -// { -// -// /* һӦд͵ĺʵ */ -// {cuda_c_sum, COL, "CUDA optimized column sum"}, -// /* ڶӦд͵ĺʵ */ -// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"}, -// -// {c_sum, COL, "Column sum, reference implementation"}, -// -// {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, -// -// /* Ĵ벻޸Ļɾб */ -// {NULL,ROWCOL,NULL} -// }; diff --git a/perflab/matrix/rowcol_202302723005.c b/perflab/matrix/rowcol_202302723005.c deleted file mode 100644 index b504582..0000000 --- a/perflab/matrix/rowcol_202302723005.c +++ /dev/null @@ -1,69 +0,0 @@ -/************************************************************************** - ??/??????????????????????????????? - 1. ??????????????????????????????? - 2. ?????????????????????? - 3. ??rc_fun_rec rc_fun_tab?????????????????? - ??????????????????????????????????????????? -***************************************************************************/ - -/* - ????201209054233 - ?????????????? -*/ - -#include "rowcol.h" -#include -#include -#include - -/* ????????????????? */ -/* ??????????????????????????????????????????????? - ??????????2????????????????? -*/ - -void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { - int i, j; - for (j = 0; j < N; j++) { - colsum[j] = 0; - for (i = 0; i < N; i++) - colsum[j] += M[i][j]; - } -} - -/* ???????????????????? */ -/* ??????????????????????? */ - -void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) { - int i, j; - for (i = 0; i < N; i++) { - rowsum[i] = colsum[i] = 0; - for (j = 0; j < N; j++) { - rowsum[i] += M[i][j]; - colsum[i] += M[j][i]; - } - } -} - -/* - ????????????????????????????????????????, COL/ROWCOL, "?????????"?? - COL?????????????????????? - ROWCOL??????????????????????? - ????????????????????????????? - ???? - {my_c_sum1, "?????????????????"}, - {my_rc_sum2, "??????????????????"}, -*/ - -rc_fun_rec rc_fun_tab[] = { - - /* ???????????????????????????????? */ - {c_sum, COL, "Best column sum"}, - /* ?????????????????????????????????? */ - {rc_sum, ROWCOL, "Best row and column sum"}, - - {c_sum, COL, "Column sum, reference implementation"}, - - {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, - - /* ??????????????????????????????????????? */ - {NULL, ROWCOL, NULL}}; \ No newline at end of file diff --git a/perflab/matrix/rowcol_202302723005.o b/perflab/matrix/rowcol_202302723005.o deleted file mode 100644 index b28983c..0000000 Binary files a/perflab/matrix/rowcol_202302723005.o and /dev/null differ diff --git a/perflab/matrix/rowcol_723005.c b/perflab/matrix/rowcol_723005.c new file mode 100644 index 0000000..6002639 --- /dev/null +++ b/perflab/matrix/rowcol_723005.c @@ -0,0 +1,121 @@ +/************************************************************************** + /ͺҪ༭ļ + 1. ѧšע͵ķʽд棻 + 2. ʵֲͬ汾ͺ + 3. ༭rc_fun_rec rc_fun_tab飬õĴ + õк͡õͣΪǰ +***************************************************************************/ + +/* + ѧţ202302723005 + ̾ +*/ + + +#include +#include +#include "rowcol.h" +#include + +/* οͺʵ */ +/* еÿһеĺ͡עк˵ò + һģֻǵ2õ +*/ +/* +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (j = 0; j < N; j++) { + colsum[j] = 0; + for (i = 0; i < N; i++) + colsum[j] += M[i][j]; + } +} +*/ +void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + for (j = 0; j < N; j += 4) { + int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0; + for (i = 0; i < N; i++) { + sum0 += M[i][j]; + sum1 += M[i][j + 1]; + sum2 += M[i][j + 2]; + sum3 += M[i][j + 3]; + } + colsum[j] = sum0; + colsum[j + 1] = sum1; + colsum[j + 2] = sum2; + colsum[j + 3] = sum3; + } +} + + +/* οкͺʵ */ +/* еÿһСÿһеĺ͡ */ +/* +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) +{ + int i,j; + for (i = 0; i < N; i++) { + rowsum[i] = colsum[i] = 0; + for (j = 0; j < N; j++) { + rowsum[i] += M[i][j]; + colsum[i] += M[j][i]; + } + } +} +*/ +void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) { + int i, j; + // ʼcolsum + for (i = 0; i < N; i++) { + colsum[i] = 0; + } + + // ֿ鴦߻ + for (i = 0; i < N; i += 4) { + int row_sum0 = 0, row_sum1 = 0, row_sum2 = 0, row_sum3 = 0; + for (j = 0; j < N; j++) { + // к + row_sum0 += M[i][j]; + row_sum1 += M[i + 1][j]; + row_sum2 += M[i + 2][j]; + row_sum3 += M[i + 3][j]; + + // ͬʱк + colsum[j] += M[i][j] + M[i + 1][j] + M[i + 2][j] + M[i + 3][j]; + } + rowsum[i] = row_sum0; + rowsum[i + 1] = row_sum1; + rowsum[i + 2] = row_sum2; + rowsum[i + 3] = row_sum3; + } +} + + + +/* + ԪأÿһԪأ, COL/ROWCOL, "ַ" + COLʾúÿһеĺ + ROWCOLʾúÿһСÿһеĺ + Ϊõʵ֣ǰ档 + 磺 + {my_c_sum1, "ʵ"}, + {my_rc_sum2, "һʵ"}, +*/ + +rc_fun_rec rc_fun_tab[] = +{ + + /* һӦд͵ĺʵ */ + {c_sum, COL, "Best column sum"}, + /* ڶӦд͵ĺʵ */ + {rc_sum, ROWCOL, "Best row and column sum"}, + + {c_sum, COL, "Column sum, reference implementation"}, + + {rc_sum, ROWCOL, "Row and column sum, reference implementation"}, + + /* Ĵ벻޸Ļɾб */ + {NULL,ROWCOL,NULL} +}; diff --git a/perflab/matrix/rowcol_test.c b/perflab/matrix/rowcol_test.c index e6a046e..b785f11 100644 --- a/perflab/matrix/rowcol_test.c +++ b/perflab/matrix/rowcol_test.c @@ -1,9 +1,9 @@ #include #include -// #include -#include "clock.h" -#include "fcyc.h" +//#include #include "rowcol.h" +#include "fcyc.h" +#include "clock.h" #define MAX_ITER_COUNT 100 @@ -11,9 +11,9 @@ static struct { double cref; /* Cycles taken by reference solution */ double cbest; /* Cycles taken by our best implementation */ -} cstandard[2] = { - {7.7, 6.40}, /* Column Sum */ - {9.75, 6.60} /* Row & Column Sum */ +} cstandard[2] = +{{7.7, 6.40}, /* Column Sum */ + {9.75, 6.60} /* Row & Column Sum */ }; /* Put in code to align matrix so that it starts on a cache block boundary. @@ -26,7 +26,7 @@ static struct { #define WPB 16 int verbose = 1; -int data[N * N + WPB]; +int data[N*N+WPB]; int *mstart; typedef vector_t *row_t; @@ -37,122 +37,137 @@ vector_t rsref, csref, rcomp, ccomp; static void init_tests(void); extern void make_CPU_busy(void); -static void init_tests(void) { - int i, j; - size_t bytes_per_block = sizeof(int) * WPB; - /* round mstart up to nearest block boundary */ - mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block * - bytes_per_block); - for (i = 0; i < N; i++) { - rsref[i] = csref[i] = 0; - } - for (i = 0; i < N; i++) { - for (j = 0; j < N; j++) { - int val = rand(); - mstart[i * N + j] = val; - rsref[i] += val; - csref[j] += val; +static void init_tests(void) +{ + int i, j; + size_t bytes_per_block = sizeof(int) * WPB; + /* round mstart up to nearest block boundary */ + mstart = (int *) + (((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block); + for (i = 0; i < N; i++) { + rsref[i] = csref[i] = 0; + } + for (i = 0; i < N; i++) { + for (j = 0; j < N; j++) { + int val = rand(); + mstart[i*N+j] = val; + rsref[i] += val; + csref[j] += val; + } } - } } + /* Test function on all values */ int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) { - int i; - int ok = 1; + int i; + int ok = 1; - for (i = 0; i < N; i++) - rcomp[i] = ccomp[i] = 0xDEADBEEF; - f((row_t)mstart, rcomp, ccomp); + for (i = 0; i < N; i++) + rcomp[i] = ccomp[i] = 0xDEADBEEF; + f((row_t)mstart, rcomp, ccomp); + + for (i = 0; ok && i < N; i++) { + if (rc_type == ROWCOL + && rsref[i] != rcomp[i]) { + ok = 0; + if (rpt) + fprintf(rpt, + "Ե%dеļȷ%dǼõ%d\n", + i, rsref[i], rcomp[i]); + } + if ((rc_type == ROWCOL || rc_type == COL) + && csref[i] != ccomp[i]) { + ok = 0; + if (rpt) + fprintf(rpt, + "Ե%dеļȷ%dǼõ%d\n", + i, csref[i], ccomp[i]); + } - for (i = 0; ok && i < N; i++) { - if (rc_type == ROWCOL && rsref[i] != rcomp[i]) { - ok = 0; - if (rpt) - fprintf(rpt, "对第%d行的计算出错!正确结果是%d,但是计算得到%d\n", i, - rsref[i], rcomp[i]); } - if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) { - ok = 0; - if (rpt) - fprintf(rpt, "对第%d列的计算出错!正确结果是%d,但是计算得到%d\n", i, - csref[i], ccomp[i]); - } - } - return ok; + return ok; } /* Kludgy way to interface to cycle measuring code */ -void do_test(int *intf) { - rc_fun f = (rc_fun)intf; +void do_test(int *intf) +{ + rc_fun f = (rc_fun) intf; f((row_t)mstart, rcomp, ccomp); } -void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) { - int i; - int *intf = (int *)f; +void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) +{ + int i; + int *intf = (int *) f; double t, cme; t = 0; - if (verbose) - printf("函数:%s\n", descr); + if (verbose) printf("%s\n", descr); if (test_rc(f, stdout, rc_type)) { - make_CPU_busy(); - for (i = 0; i < MAX_ITER_COUNT; i++) - t += fcyc((void (*)(long *))do_test, intf); - t = t / MAX_ITER_COUNT; - cme = t / (N * N); - if (verbose) - printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", t, cme); + make_CPU_busy(); + for (i=0;i 1.1 * (sbest - 1) + 1) + if (smeas > 1.1*(sbest-1)+1) return 120; - return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1); + return 100*((smeas-1.0)/(sbest-1.0) + 0.1); } -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) +{ int i; double cme; - double cme_c, cme_rc; - int EnableScore = 0; - - if (argc == 3) { - EnableScore = 1; - verbose = 0; + double cme_c,cme_rc; + int EnableScore=0; + + if (argc == 3) + { + EnableScore = 1; + verbose = 0; } init_tests(); - set_fcyc_clear_cache(1); /* Set so that clears cache between runs */ + set_fcyc_clear_cache(1); /* Set so that clears cache between runs */ for (i = 0; rc_fun_tab[i].f != NULL; i++) { - cme = 100.0; - time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme); - if (i == 0) { - cme_c = cme; - if (EnableScore == 0) { - printf(" 最高\"列求和\"得分 ======================== %.0f\n", - compute_score(cme, cstandard[0].cref, cstandard[0].cbest)); - } - } - if (i == 1) { - cme_rc = cme; - if (EnableScore == 0) { - printf(" 最高\"行和列求和\"得分 ====================== %.0f\n", - compute_score(cme, cstandard[1].cref, cstandard[1].cbest)); - } - } + cme = 100.0; + time_rc(rc_fun_tab[i].f, + rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme); + if (i == 0) + { + cme_c = cme; + if (EnableScore==0) + { + printf(" \"\"÷ ======================== %.0f\n", + compute_score(cme, cstandard[0].cref, cstandard[0].cbest)); + } + } + if (i == 1) + { + cme_rc = cme; + if (EnableScore==0) + { + printf(" \"к\"÷ ====================== %.0f\n", + compute_score(cme, cstandard[1].cref, cstandard[1].cbest)); + } + } } - + if (EnableScore) - printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c, - compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc, - compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest)); + printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), + cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest)); return 0; } diff --git a/perflab/matrix/rowcol_test.o b/perflab/matrix/rowcol_test.o deleted file mode 100644 index d214ba4..0000000 Binary files a/perflab/matrix/rowcol_test.o and /dev/null differ diff --git a/perflab/poly/Makefile b/perflab/poly/Makefile deleted file mode 100644 index 9f55dad..0000000 --- a/perflab/poly/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -CC = gcc -NVCC = nvcc -CFLAGS = -Wall -O2 -g -CUDA_FLAGS = -O2 -g -LDFLAGS = -lm -lcudart - -# Source files -SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c -CUDA_SRCS = poly.cu -OBJS = $(SRCS:.c=.o) poly.o - -# Target executable -TARGET = poly_test - -# Default target -all: $(TARGET) - -# Rule to build the executable -$(TARGET): $(OBJS) - $(CC) $(OBJS) -o $(TARGET) $(LDFLAGS) - -# Rule to build object files -%.o: %.c - $(CC) $(CFLAGS) -c $< -o $@ - -# Rule to build CUDA object files -poly.o: poly.cu - $(NVCC) $(CUDA_FLAGS) -c $< -o $@ - -# Clean rule -clean: - rm -f $(OBJS) $(TARGET) - -# Phony targets -.PHONY: all clean \ No newline at end of file diff --git a/perflab/poly/a.exe b/perflab/poly/a.exe new file mode 100755 index 0000000..305d39b Binary files /dev/null and b/perflab/poly/a.exe differ diff --git a/perflab/poly/clock.c b/perflab/poly/clock.c index 159ba4e..a587590 100644 --- a/perflab/poly/clock.c +++ b/perflab/poly/clock.c @@ -13,11 +13,11 @@ #include #include #include -#include -// #include -#include "clock.h" -#include +#include +//#include #include +#include +#include "clock.h" /* Use x86 cycle counter */ @@ -27,195 +27,203 @@ static unsigned cyc_lo = 0; /* Set *hi and *lo to the high and low order bits of the cycle counter. Implementation requires assembly code to use the rdtsc instruction. */ -void access_counter(unsigned *hi, unsigned *lo) { +void access_counter(unsigned *hi, unsigned *lo) +{ - long long counter; + long long counter; - counter = __rdtsc(); - (*hi) = (unsigned int)(counter >> 32); - (*lo) = (unsigned int)counter; - /* + counter = __rdtsc(); + (*hi) = (unsigned int)(counter >> 32); + (*lo) = (unsigned int)counter; +/* - LARGE_INTEGER lPerformanceCount; + LARGE_INTEGER lPerformanceCount; - QueryPerformanceCounter(&lPerformanceCount); - (*hi) = (unsigned int)lPerformanceCount.HighPart; - (*lo) = (unsigned int)lPerformanceCount.LowPart; - // printf("%08X %08X\n",(*hi),(*lo)); - */ + QueryPerformanceCounter(&lPerformanceCount); + (*hi) = (unsigned int)lPerformanceCount.HighPart; + (*lo) = (unsigned int)lPerformanceCount.LowPart; +// printf("%08X %08X\n",(*hi),(*lo)); +*/ } + /* Record the current value of the cycle counter. */ -void start_counter() { access_counter(&cyc_hi, &cyc_lo); } +void start_counter() +{ + access_counter(&cyc_hi, &cyc_lo); +} /* Return the number of cycles since the last call to start_counter. */ -double get_counter() { - unsigned ncyc_hi, ncyc_lo; - unsigned hi, lo, borrow; - double result; +double get_counter() +{ + unsigned ncyc_hi, ncyc_lo; + unsigned hi, lo, borrow; + double result; - /* Get cycle counter */ - access_counter(&ncyc_hi, &ncyc_lo); + /* Get cycle counter */ + access_counter(&ncyc_hi, &ncyc_lo); - /* Do double precision subtraction */ - lo = ncyc_lo - cyc_lo; - borrow = cyc_lo > ncyc_lo; - hi = ncyc_hi - cyc_hi - borrow; - result = (double)hi * (1 << 30) * 4 + lo; - return result; + /* Do double precision subtraction */ + lo = ncyc_lo - cyc_lo; + borrow = cyc_lo > ncyc_lo; + hi = ncyc_hi - cyc_hi - borrow; + result = (double) hi * (1 << 30) * 4 + lo; + return result; } -void make_CPU_busy(void) { - volatile double old_tick, new_tick; - start_counter(); - old_tick = get_counter(); - new_tick = get_counter(); - while (new_tick - old_tick < 1000000000) - new_tick = get_counter(); +void make_CPU_busy(void) +{ + volatile double old_tick,new_tick; + start_counter(); + old_tick = get_counter(); + new_tick = get_counter(); + while (new_tick - old_tick < 1000000000) + new_tick = get_counter(); } -// CPU��Ƶ�� -double mhz(int verbose) { - LARGE_INTEGER lFrequency; - LARGE_INTEGER lPerformanceCount_Start; - LARGE_INTEGER lPerformanceCount_End; - double mhz; - double fTime; - __int64 _i64StartCpuCounter; - __int64 _i64EndCpuCounter; - // On a multiprocessor machine, it should not matter which processor is - // called. However, you can get different results on different processors due - // to bugs in the BIOS or the HAL. To specify processor affinity for a thread, - // use the SetThreadAffinityMask function. - HANDLE hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); +//CPUƵ +double mhz(int verbose) +{ + LARGE_INTEGER lFrequency; + LARGE_INTEGER lPerformanceCount_Start; + LARGE_INTEGER lPerformanceCount_End; + double mhz; + double fTime; + __int64 _i64StartCpuCounter; + __int64 _i64EndCpuCounter; + //On a multiprocessor machine, it should not matter which processor is called. + //However, you can get different results on different processors due to bugs in + //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function. + HANDLE hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); - // �����ϸ߾��ȶ�ʱ���ľ���Ƶ�� - // �����ʱ��Ӧ�þ���һƄ1�78253����8254 - // ��intel ich7�м�����8254 - QueryPerformanceFrequency(&lFrequency); - // if (verbose>0) - // printf("�߾��ȶ�ʱ���ľ���Ƶ�ʣ�%1.0fHz.\n",(double)lFrequency.QuadPart); + //ϸ߾ȶʱľƵ + //ʱӦþһƬ82538254 + //intel ich7м8254 + QueryPerformanceFrequency(&lFrequency); +// if (verbose>0) +// printf("߾ȶʱľƵʣ%1.0fHz.\n",(double)lFrequency.QuadPart); - // �����ʱ��ÿ����һ��ʱ�����ڣ����������+1 - QueryPerformanceCounter(&lPerformanceCount_Start); + //ʱÿһʱڣ+1 + QueryPerformanceCounter(&lPerformanceCount_Start); - // RDTSCָ��:��ȡCPU������ʱ�������� - _i64StartCpuCounter = __rdtsc(); + //RDTSCָ:ȡCPUʱ + _i64StartCpuCounter=__rdtsc(); - // ��ʱ��һ��,����Сһ�� - // int nTemp=100000; - // while (--nTemp); - Sleep(200); + //ʱһ,Сһ + //int nTemp=100000; + //while (--nTemp); + Sleep(200); - QueryPerformanceCounter(&lPerformanceCount_End); + QueryPerformanceCounter(&lPerformanceCount_End); - _i64EndCpuCounter = __rdtsc(); + _i64EndCpuCounter=__rdtsc(); - // f=1/T => f=��������/(��������*T) - // ����ġ��������ᅣ1�7*T������ʱ��ᅣ1�7 - fTime = ((double)lPerformanceCount_End.QuadPart - - (double)lPerformanceCount_Start.QuadPart) / - (double)lFrequency.QuadPart; + //f=1/T => f=/(*T) + //ġ*Tʱ + fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart) + /(double)lFrequency.QuadPart; - mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0); - if (verbose > 0) - printf("CPUƵ��Ϊ:%1.6fMHz.\n", mhz); - return mhz; + mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0); + if (verbose>0) + printf("CPUƵΪ:%1.6fMHz.\n",mhz); + return mhz; } -double CPU_Factor1(void) { - double result; - int i, j, k, ii, jj, kk; - LARGE_INTEGER lStart, lEnd; +double CPU_Factor1(void) +{ + double result; + int i,j,k,ii,jj,kk; + LARGE_INTEGER lStart,lEnd; LARGE_INTEGER lFrequency; HANDLE hThread; double fTime; QueryPerformanceFrequency(&lFrequency); - ii = 43273; - kk = 1238; - result = 1; - jj = 1244; + ii = 43273; + kk = 1238; + result = 1; + jj = 1244; - hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); QueryPerformanceCounter(&lStart); //_asm("cpuid"); - start_counter(); - for (i = 0; i < 100; i++) - for (j = 0; j < 1000; j++) - for (k = 0; k < 1000; k++) - kk += kk * ii + jj; + start_counter(); + for (i=0;i<100;i++) + for (j=0;j<1000;j++) + for (k=0;k<1000;k++) + kk += kk*ii+jj; - result = get_counter(); - QueryPerformanceCounter(&lEnd); - fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart); - printf("CPU����ʱ��Ϊ%f", result); - printf("\t %f\n", fTime); - return result; + result = get_counter(); + QueryPerformanceCounter(&lEnd); + fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart); + printf("CPUʱΪ%f",result); + printf("\t %f\n",fTime); + return result; } -double CPU_Factor(void) { - double frequency; - double multiplier = 1000 * 1000 * 1000; // nano - LARGE_INTEGER lFrequency; - LARGE_INTEGER start, stop; - HANDLE hThread; - int i; - const int gigahertz = 1000 * 1000 * 1000; - const int known_instructions_per_loop = 27317; +double CPU_Factor(void) +{ + double frequency; + double multiplier = 1000 * 1000 * 1000;//nano + LARGE_INTEGER lFrequency; + LARGE_INTEGER start,stop; + HANDLE hThread; + int i; + const int gigahertz= 1000*1000*1000; + const int known_instructions_per_loop = 27317; - int iterations = 100000000; - int g = 0; - double normal_ticks_per_second; - double ticks; - double time; - double loops_per_sec; - double instructions_per_loop; - double ratio; - double actual_freq; + int iterations = 100000000; + int g = 0; + double normal_ticks_per_second; +double ticks; +double time; +double loops_per_sec; +double instructions_per_loop; +double ratio; +double actual_freq; - QueryPerformanceFrequency(&lFrequency); - frequency = (double)lFrequency.QuadPart; + QueryPerformanceFrequency(&lFrequency); + frequency = (double)lFrequency.QuadPart; - hThread = GetCurrentThread(); - SetThreadAffinityMask(hThread, 0x1); - QueryPerformanceCounter(&start); - for (i = 0; i < iterations; i++) { - g++; - g++; - g++; - g++; - } - QueryPerformanceCounter(&stop); + hThread=GetCurrentThread(); + SetThreadAffinityMask(hThread,0x1); + QueryPerformanceCounter(&start); + for( i = 0; i < iterations; i++) + { + g++; + g++; + g++; + g++; + } + QueryPerformanceCounter(&stop); - // normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ - // 3199 - normal_ticks_per_second = frequency * 1000; - ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); - time = (ticks * multiplier) / frequency; - loops_per_sec = iterations / (time / multiplier); - instructions_per_loop = normal_ticks_per_second / loops_per_sec; + //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199 + normal_ticks_per_second = frequency * 1000; + ticks = (double)((double)stop.QuadPart - (double)start.QuadPart); + time = (ticks * multiplier) /frequency; + loops_per_sec = iterations / (time/multiplier); + instructions_per_loop = normal_ticks_per_second / loops_per_sec; - ratio = (instructions_per_loop / known_instructions_per_loop); - actual_freq = normal_ticks_per_second / ratio; - /* - actual_freq = normal_ticks_per_second / ratio; - actual_freq = known_instructions_per_loop*iterations*multiplier/time; + ratio = (instructions_per_loop / known_instructions_per_loop); + actual_freq = normal_ticks_per_second / ratio; +/* + actual_freq = normal_ticks_per_second / ratio; + actual_freq = known_instructions_per_loop*iterations*multiplier/time; - 2293 = x/time; - - 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 - loops_per_sec = iterations*frequency / ticks - - instructions_per_loop = / loops_per_sec; - */ - printf("Perf counter freq: %f\n", normal_ticks_per_second); - printf("Loops per sec: %f\n", loops_per_sec); - printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); - printf("Presumed freq: %f\n", actual_freq); - printf("ratio: %f\n", ratio); - printf("time=%f\n", time); - return ratio; + 2293 = x/time; + + 2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000 + loops_per_sec = iterations*frequency / ticks + + instructions_per_loop = / loops_per_sec; +*/ + printf("Perf counter freq: %f\n", normal_ticks_per_second); + printf("Loops per sec: %f\n", loops_per_sec); + printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop); + printf("Presumed freq: %f\n", actual_freq); + printf("ratio: %f\n", ratio); + printf("time=%f\n",time); + return ratio; } diff --git a/perflab/poly/fcyc.c b/perflab/poly/fcyc.c index dc2f735..9246307 100644 --- a/perflab/poly/fcyc.c +++ b/perflab/poly/fcyc.c @@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params) if (clear_cache) clear(); start_counter(); - f(params); + f((long int*)params); cyc = get_counter(); if (cyc > 0.0) add_sample(cyc); @@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params) clear(); start_counter(); for (i=0;i 0.0) add_sample(cyc); diff --git a/perflab/poly/poly.cu b/perflab/poly/poly.cu deleted file mode 100644 index 73347fe..0000000 --- a/perflab/poly/poly.cu +++ /dev/null @@ -1,325 +0,0 @@ -/************************************************************************** - 多项式计算函数。按下面的要求编辑此文件: - 1. 将你的学号、姓名,以注释的方式写到下面; - 2. 实现不同版本的多项式计算函数; - 3. 编辑peval_fun_rec peval_fun_tab数组,将你的最好的答案 - (最小CPE、最小C10)作为数组的前两项 -***************************************************************************/ - -/* - 学号:201209054233 - 姓名:夜半加班狂 -*/ - - - -#include -#include -#include -typedef int (*peval_fun)(int*, int, int); - -typedef struct { - peval_fun f; - char *descr; -} peval_fun_rec, *peval_fun_ptr; - - -/************************************************************************** - Edit this comment to indicate your name and Andrew ID -#ifdef ASSIGN - Submission by Harry Q. Bovik, bovik@andrew.cmu.edu -#else - Instructor's version. - Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02 -#endif -***************************************************************************/ - -/* - 实现一个指定的常系数多项式计算 - 第一次,请直接运行程序,以便获知你需要实现的常系数是啥 -*/ -int const_poly_eval(int *not_use, int not_use2, int x) -{ - int result = 0; -/* int i; - int xpwr = 1; // x的幂次 - int a[4] = {21,90,42,88}; - for (i = 0; i <= 3; i++) { - result += a[i]*xpwr; - xpwr *= x; - } -*/ -// 90 = 64 + 32 - 4 - 2 -// 42 = 32 + 8 + 2 -// 88 = 64 + 16 + 8 - int x64,x32,x16,x8,x4,x2; - - x64 = x << 6; - x32 = x << 5; - x16 = x << 4; - x8 = x << 3; - x4 = x << 2; - x2 = x << 1; - result = 21 + x64+x32-x4-x2 + ((x32+x8+x2) + (x64+x16+x8)*x)*x; - return result; -} - - - -/* 多项式计算函数。注意:这个只是一个参考实现,你需要实现自己的版本 */ - -/* - 友情提示:lcc支持ATT格式的嵌入式汇编,例如 - - _asm("movl %eax,%ebx"); - _asm("pushl %edx"); - - 可以在lcc中project->configuration->Compiler->Code Generation->Generate .asm, - 将其选中后,可以在lcc目录下面生成对应程序的汇编代码实现。通过查看汇编文件, - 你可以了解编译器是如何实现你的代码的。有些实现可能非常低效。 - 你可以在适当的地方加入嵌入式汇编,来大幅度提高计算性能。 -*/ - -int poly_eval(int *a, int degree, int x) -{ - int result = 0; - int i; - int xpwr = 1; /* x的幂次 */ -// printf("阶=%d\n",degree); - for (i = 0; i <= degree; i++) { - result += a[i]*xpwr; - xpwr *= x; - } - return result; -} - -/* CUDA优化的多项式计算函数 - 低CPE版本 */ -int cuda_poly_eval_low_cpe(int *a, int degree, int x) -{ - // 对于低CPE版本,我们使用CUDA并行计算多项式的各个项 - // 然后将结果传回主机进行求和 - - // 分配设备内存 - int *d_a, *d_results; - cudaError_t err; - - // 分配内存 - err = cudaMalloc(&d_a, (degree + 1) * sizeof(int)); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return 0; - } - - err = cudaMalloc(&d_results, (degree + 1) * sizeof(int)); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - return 0; - } - - // 将系数从主机复制到设备 - err = cudaMemcpy(d_a, a, (degree + 1) * sizeof(int), cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - cudaFree(d_results); - return 0; - } - - // 定义CUDA核函数 - dim3 blockDim(256); - dim3 gridDim((degree + 1 + blockDim.x - 1) / blockDim.x); - - // 启动核函数 - cudaPolyEvalLowCPE<<>>(d_a, degree, x, d_results); - - // 检查核函数执行错误 - err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - cudaFree(d_results); - return 0; - } - - // 分配主机内存用于结果 - int *h_results = (int *)malloc((degree + 1) * sizeof(int)); - if (h_results == NULL) { - printf("Memory allocation error\n"); - cudaFree(d_a); - cudaFree(d_results); - return 0; - } - - // 将结果从设备复制回主机 - err = cudaMemcpy(h_results, d_results, (degree + 1) * sizeof(int), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - free(h_results); - cudaFree(d_a); - cudaFree(d_results); - return 0; - } - - // 在主机上求和 - int result = 0; - for (int i = 0; i <= degree; i++) { - result += h_results[i]; - } - - // 释放内存 - free(h_results); - cudaFree(d_a); - cudaFree(d_results); - - return result; -} - -/* CUDA优化的多项式计算函数 - 10阶优化版本 */ -int cuda_poly_eval_degree10(int *a, int degree, int x) -{ - // 对于10阶多项式,我们可以使用更优化的方法 - // 使用CUDA并行计算,但针对10阶多项式进行特殊优化 - - // 分配设备内存 - int *d_a, *d_result; - cudaError_t err; - - // 分配内存 - err = cudaMalloc(&d_a, (degree + 1) * sizeof(int)); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - return 0; - } - - err = cudaMalloc(&d_result, sizeof(int)); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - return 0; - } - - // 将系数从主机复制到设备 - err = cudaMemcpy(d_a, a, (degree + 1) * sizeof(int), cudaMemcpyHostToDevice); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - cudaFree(d_result); - return 0; - } - - // 定义CUDA核函数 - dim3 blockDim(256); - dim3 gridDim(1); // 只需要一个块,因为我们只需要一个结果 - - // 启动核函数 - cudaPolyEvalDegree10<<>>(d_a, degree, x, d_result); - - // 检查核函数执行错误 - err = cudaGetLastError(); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - cudaFree(d_result); - return 0; - } - - // 获取结果 - int result; - err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost); - if (err != cudaSuccess) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); - cudaFree(d_a); - cudaFree(d_result); - return 0; - } - - // 释放内存 - cudaFree(d_a); - cudaFree(d_result); - - return result; -} - -/* CUDA核函数 - 低CPE版本 */ -__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx <= degree) { - // 计算x的幂 - int xpwr = 1; - for (int i = 0; i < idx; i++) { - xpwr *= x; - } - - // 计算这一项的结果 - results[idx] = a[idx] * xpwr; - } -} - -/* CUDA核函数 - 10阶优化版本 */ -__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result) -{ - // 使用共享内存来存储中间结果 - __shared__ int shared_result; - - // 只有第一个线程初始化共享结果 - if (threadIdx.x == 0) { - shared_result = 0; - } - __syncthreads(); - - // 每个线程计算一部分项 - int local_result = 0; - int xpwr = 1; - - // 计算x的幂 - for (int i = 0; i < threadIdx.x; i++) { - xpwr *= x; - } - - // 计算这一项的结果 - if (threadIdx.x <= degree) { - local_result = a[threadIdx.x] * xpwr; - } - - // 使用原子操作累加结果 - atomicAdd(&shared_result, local_result); - - // 同步所有线程 - __syncthreads(); - - // 只有第一个线程将结果写回全局内存 - if (threadIdx.x == 0) { - *result = shared_result; - } -} - -/* - 这个表格包含多个数组元素,每一组元素(函数名字, "描述字符串") - 将你认为最好的两个实现,放在最前面。 - 比如: - {my_poly_eval1, "超级垃圾实现"}, - {my_poly_eval2, "好一点的实现"}, -*/ - -peval_fun_rec peval_fun_tab[] = -{ - - /* 第一项,应当是你写的最好CPE的函数实现 */ - {cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"}, - /* 第二项,应当是你写的在10阶时具有最好性能的实现 */ - {cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"}, - - {poly_eval, "poly_eval: 参考实现"}, - - /* 下面的代码不能修改或者删除!!表明数组列表结束 */ - {NULL, ""} -}; - - - - - - - diff --git a/perflab/poly/poly.o b/perflab/poly/poly.o deleted file mode 100644 index 1b650e0..0000000 Binary files a/perflab/poly/poly.o and /dev/null differ diff --git a/perflab/poly/poly.c b/perflab/poly/poly_723005.c similarity index 53% rename from perflab/poly/poly.c rename to perflab/poly/poly_723005.c index 130a2cc..a02fe8b 100644 --- a/perflab/poly/poly.c +++ b/perflab/poly/poly_723005.c @@ -1,125 +1,143 @@ -/************************************************************************** - ʽ㺯Ҫ༭ļ - 1. ѧšע͵ķʽд棻 - 2. ʵֲͬ汾Ķʽ㺯 - 3. ༭peval_fun_rec peval_fun_tab飬õĴ - СCPEСC10Ϊǰ -***************************************************************************/ - -/* - ѧţ201209054233 - ҹӰ -*/ - - - -#include -#include -typedef int (*peval_fun)(int*, int, int); - -typedef struct { - peval_fun f; - char *descr; -} peval_fun_rec, *peval_fun_ptr; - - -/************************************************************************** - Edit this comment to indicate your name and Andrew ID -#ifdef ASSIGN - Submission by Harry Q. Bovik, bovik@andrew.cmu.edu -#else - Instructor's version. - Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02 -#endif -***************************************************************************/ - -/* - ʵһָijϵʽ - һΣֱгԱ֪Ҫʵֵijϵɶ -*/ -int const_poly_eval(int *not_use, int not_use2, int x) -{ - int result = 0; -/* int i; - int xpwr = 1; // xݴ - int a[4] = {21,90,42,88}; - for (i = 0; i <= 3; i++) { - result += a[i]*xpwr; - xpwr *= x; - } -*/ -// 90 = 64 + 32 - 4 - 2 -// 42 = 32 + 8 + 2 -// 88 = 64 + 16 + 8 - int x64,x32,x16,x8,x4,x2; - - x64 = x << 6; - x32 = x << 5; - x16 = x << 4; - x8 = x << 3; - x4 = x << 2; - x2 = x << 1; - result = 21 + x64+x32-x4-x2 + ((x32+x8+x2) + (x64+x16+x8)*x)*x; - return result; -} - - - -/* ʽ㺯ע⣺ֻһοʵ֣ҪʵԼİ汾 */ - -/* - ʾlcc֧ATTʽǶʽ࣬ - - _asm("movl %eax,%ebx"); - _asm("pushl %edx"); - - lccproject->configuration->Compiler->Code Generation->Generate .asm - ѡк󣬿lccĿ¼ɶӦĻʵ֡ͨ鿴ļ - ˽ʵĴġЩʵֿܷdzЧ - ʵĵطǶʽ࣬߼ܡ -*/ - -int poly_eval(int *a, int degree, int x) -{ - int result = 0; - int i; - int xpwr = 1; /* xݴ */ -// printf("=%d\n",degree); - for (i = 0; i <= degree; i++) { - result += a[i]*xpwr; - xpwr *= x; - } - return result; -} - - - - -/* - ԪأÿһԪأ, "ַ" - Ϊõʵ֣ǰ档 - 磺 - {my_poly_eval1, "ʵ"}, - {my_poly_eval2, "һʵ"}, -*/ - -peval_fun_rec peval_fun_tab[] = -{ - - /* һӦдCPEĺʵ */ - {poly_eval, "ҹӰCPE"}, - /* ڶӦд10ʱܵʵ */ - {poly_eval, "ҹӰ10ʵ"}, - - {poly_eval, "poly_eval: οʵ"}, - - /* Ĵ벻޸Ļɾб */ - {NULL, ""} -}; - - - - - - - + +/* + ϵʽ㺯 + ͨٳ˷ʹλŻܡ + ʽresult = 37 + 72*x + 84*x^2 + 52*x^3 +*/ + +/************************************************************************** + ʽ㺯Ҫ༭ļ + 1. ѧšע͵ķʽд棻 + 2. ʵֲͬ汾Ķʽ㺯 + 3. ༭peval_fun_rec peval_fun_tab飬õĴ + СCPEСC10Ϊǰ +***************************************************************************/ + +/* + ѧţ202302723005 + ̾ +*/ + + + +#include +#include +typedef int (*peval_fun)(int*, int, int); + +typedef struct { + peval_fun f; + char *descr; +} peval_fun_rec, *peval_fun_ptr; + + +/************************************************************************** + Edit this comment to indicate your name and Andrew ID +#ifdef ASSIGN + Submission by Harry Q. Bovik, bovik@andrew.cmu.edu +#else + Instructor's version. + Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02 +#endif +***************************************************************************/ + +/* + ʵһָijϵʽ + һΣֱгԱ֪Ҫʵֵijϵɶ +*/ +int poly_eval(int *a, int degree, int x) +{ + int result = 0; + int i; + int xpwr = 1; // xݴ + + for (i = 0; i <= degree; i++) { + result += a[i]*xpwr; + xpwr *= x; + } + return result; +} + +/* ʽ㺯ע⣺ֻһοʵ֣ҪʵԼİ汾 */ + +/* + ʾlcc֧ATTʽǶʽ࣬ + + _asm("movl %eax,%ebx"); + _asm("pushl %edx"); + + lccproject->configuration->Compiler->Code Generation->Generate .asm + ѡк󣬿lccĿ¼ɶӦĻʵ֡ͨ鿴ļ + ˽ʵĴġЩʵֿܷdzЧ + ʵĵطǶʽ࣬߼ܡ +*/ +int const_poly_eval(int *not_use, int not_use2, int x) +{ + register int result = 0; + register int x1, x2, x3; + register int tmp = x; // tmp = x + register int tmp1 = tmp * tmp; // tmp1 = x^2 + register int tmp2 = tmp1 * tmp;// tmp2 = x^3 + + // 72x: 64x + 8x = (x << 6) + (x << 3) + x1 = (tmp << 6) + (tmp << 3); + + // 84x^2: 64x2 + 16x2 + 4x2 = (x2 << 6) + (x2 << 4) + (x2 << 2) + x2 = (tmp1 << 6) + (tmp1 << 4) + (tmp1 << 2); + + // 52x^3: 32x3 + 16x3 + 4x3 = (x3 << 5) + (x3 << 4) + (x3 << 2) + x3 = (tmp2 << 5) + (tmp2 << 4) + (tmp2 << 2); + + // ϲ37 + 72x + 84x2 + 52x3 + result = 37 + x1 + x2 + x3; + return result; +} + +int poly_eval12(int* a, int degree, int x) { + if (degree == 10) { + // 10ȫչɷ򣨱ԭ߼䣩 + int result = a[10]; + result = result * x + a[9]; + result = result * x + a[8]; + result = result * x + a[7]; + result = result * x + a[6]; + result = result * x + a[5]; + result = result * x + a[4]; + result = result * x + a[3]; + result = result * x + a[2]; + result = result * x + a[1]; + return result * x + a[0]; + } else { + // ͨð汾ԭ߼䣩 + int result = 0; + int x2 = x * x; + int i = degree; + for (; i > 0; i -= 2) { + result = result * x2 + a[i] * x + a[i - 1]; + } + if (i == 0) { + result = result * x + a[0]; + } + return result; + } +} +/* + ԪأÿһԪأ, "ַ" + Ϊõʵ֣ǰ档 + 磺 + {my_poly_eval1, "ʵ"}, + {my_poly_eval2, "һʵ"}, +*/ + +peval_fun_rec peval_fun_tab[] = +{ + + /* һӦдCPEĺʵ */ + {poly_eval12, "̾CPE"}, + /* ڶӦд10ʱܵʵ */ + {poly_eval12, "̾10ʵ"}, + + {poly_eval, "poly_eval: οʵ"}, + + /* Ĵ벻޸Ļɾб */ + {NULL, ""} +}; \ No newline at end of file diff --git a/perflab/poly/poly_test.c b/perflab/poly/poly_test.c index c2e8701..f08a5ab 100644 --- a/perflab/poly/poly_test.c +++ b/perflab/poly/poly_test.c @@ -2,11 +2,11 @@ #include #include #include +#include //#include #include "poly.h" #include "cpe.h" #include "clock.h" -#include double CPU_Mhz; @@ -18,7 +18,7 @@ static int coeff[MAXDEGREE+1]; #define MAX_ITER_COUNT 100 -#define REF_CPU_MHZ 2292.6 // �����ҵĴ�������Ƶ +#define REF_CPU_MHZ 2292.6 // ҵĴƵ /* Define performance standards */ static struct { @@ -27,7 +27,7 @@ static struct { } cstandard[3] = {{4.00, 1.75}, /* CPE */ {50, 43}, /* C(10) */ - {57,31} /* ��ϵ������ʽ���� */ + {57,31} /* ϵʽ */ }; int coeff_const[4]; @@ -83,7 +83,7 @@ static void init_const_poly(void) coeff_const[i] = rand_div+10; } - printf("����Ҫ�޸�poly.c��const_poly_eval������ʵ������ij�������ʽ���㣡\n"); + printf("Ҫ޸poly.cconst_poly_evalʵijʽ㣡\n"); printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]); fixval_const = ref_poly_eval(coeff_const, 3, xval); @@ -98,15 +98,15 @@ void test_const_poly(void) int my_cal = const_poly_eval(coeff_const, 3, xval); if (fixval_const != my_cal) { - printf("��ϵ������ʽ����const_poly_evalʵ�ִ���x=%d����Ԥ�ڽ����%d�����Ǽ���õ�����%d\n",xval,fixval_const,my_cal); + printf("ϵʽconst_poly_evalʵִx=%dԤڽ%dǼõ%d\n",xval,fixval_const,my_cal); exit(0); } fix_time = 0; for (i=0;i #include #include #include @@ -26,13 +25,28 @@ typedef void (*lower_fun_t)(char *s); /* Lower case conversion routines */ -/* Convert string to lower case: slow */ -void lower1(char *s) { - int i; +/* Convert string to lower case: optimized with lookup table */ +static unsigned char lcase_table[256]; +static int table_initialized = 0; - for (i = 0; i < Strlen(s); i++) - if (s[i] >= 'A' && s[i] <= 'Z') - s[i] -= ('A' - 'a'); +void init_lcase_table() { + if (!table_initialized) { + int i; + for (i = 0; i < 256; i++) + lcase_table[i] = i; + for (i = 'A'; i <= 'Z'; i++) + lcase_table[i] = i - ('A' - 'a'); + table_initialized = 1; + } +} + +void lower1(char *s) { + init_lcase_table(); + unsigned char *us = (unsigned char *)s; + while (*us) { + *us = lcase_table[*us]; + us++; + } } /* Convert string to lower case: faster */ @@ -137,9 +151,20 @@ unsigned h_xor(char *s) { return val % tsize; } -#define HCNT 3 -hash_fun_t hash_fun_set[HCNT] = {h_mod, h_add, h_xor}; -char *hash_fun_names[HCNT] = {"h_mod", "h_add", "h_xor"}; +/* FNV-1a hash function */ +unsigned h_fnv1a(char *s) { + unsigned hash = 2166136261u; + unsigned char *us = (unsigned char *)s; + while (*us) { + hash ^= *us++; + hash *= 16777619u; + } + return hash % tsize; +} + +#define HCNT 4 +hash_fun_t hash_fun_set[HCNT] = {h_mod, h_add, h_xor, h_fnv1a}; +char *hash_fun_names[HCNT] = {"h_mod", "h_add", "h_xor", "h_fnv1a"}; char *save_string(char *s) { char *result = (char *)malloc(Strlen(s) + 1); @@ -194,7 +219,6 @@ h_ptr find_ele_iter_f(h_ptr ls, char *s) { h_ptr find_ele_iter_r(h_ptr ls, char *s) { h_ptr ele = ls; h_ptr last = NULL; -#pragma omp parallel shared(ls, s, last) for (ele = ls; ele; ele = ele->next) { char *word = ele->word; if (strcmp(s, word) == 0) { @@ -220,10 +244,10 @@ h_ptr find_ele_iter_r(h_ptr ls, char *s) { typedef h_ptr (*find_ele_fun_t)(h_ptr, char *); #define FCNT 3 -find_ele_fun_t find_ele_fun_set[FCNT] = {find_ele_iter_r, find_ele_iter_f, - find_ele_rec}; -char *find_ele_fun_names[FCNT] = {"find_ele_iter_r", "find_ele_iter_f", - "find_ele_rec"}; +find_ele_fun_t find_ele_fun_set[FCNT] = {find_ele_rec, find_ele_iter_f, + find_ele_iter_r}; +char *find_ele_fun_names[FCNT] = {"find_ele_rec", "find_ele_iter_f", + "find_ele_iter_r"}; /* Comparision function for sorting */ int compare_ele(const void *vele1, const void *vele2) { @@ -386,8 +410,8 @@ void word_freq(FILE *src, int verbose, int ngram, int size, int quick, int main(int argc, char *argv[]) { int verbose = 1; - int size = 1024; - int hash_fun_index = 0; + int size = 32768; // 修改默认值为32768 + int hash_fun_index = 3; int lower_fun_index = 0; int find_fun_index = 0; int ngram = 1;