perflab finished
This commit is contained in:
653
branchPrediction.c
Executable file
653
branchPrediction.c
Executable file
@ -0,0 +1,653 @@
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x<max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x>0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define BITS_OF_PC 13 // 选择13位的PC作为索引
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32 *State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << BITS_OF_PC); // 状态数组项数
|
||||
|
||||
State = (UINT32 *)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
for(UINT32 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = 2;
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低13位,去索引状态数组State,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
UINT32 index = PC>>2 & 0x1fff;
|
||||
if(State[index] == 0 || State[index] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
//return TAKEN;
|
||||
//return NOT_TAKEN;
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
UINT32 index = PC>>2 & 0x1fff;
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index] = SatIncrement(State[index], 3);
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index] = SatDecrement(State[index]);
|
||||
}
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
}
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x<max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x>0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define BITS_OF_PC 13 // 选择13位的PC作为索引
|
||||
|
||||
#define STATE_MAX 7
|
||||
#define STATE_INIT 3
|
||||
|
||||
UINT32 *State; // 状态数组,用于保存分支指令的状态机,实际只使用最低3位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << BITS_OF_PC); // 状态数组项数
|
||||
|
||||
State = (UINT32 *)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
for(UINT64 i = 0; i <= StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低13位,去索引状态数组State,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
UINT64 index = (PC>>2) & 0x1fff;
|
||||
if(State[index] == 0 || State[index] == 1 || State[index] == 2 || State[index] == 3) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
//return TAKEN;
|
||||
//return NOT_TAKEN;
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
UINT64 index = (PC>>2) & 0x1fff;
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index] = SatIncrement(State[index], 7);
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index] = SatDecrement(State[index]);
|
||||
}
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
}
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x < max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x > 0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define BITS_OF_PC 10 // 选择10位的PC作为索引
|
||||
#define LOCAL_HIST_LEN 3 // 局部历史长度,3位
|
||||
#define LOCAL_HIST_MASK ~(~0 << LOCAL_HIST_LEN)
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32* pht; // pattern history table 模式历史表
|
||||
UINT32 phtArraySize; // pht数组项数
|
||||
UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << (BITS_OF_PC + LOCAL_HIST_LEN)); // 状态数组项数
|
||||
|
||||
State = (UINT32*)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
phtArraySize = (1 << BITS_OF_PC); // pht数组项数
|
||||
|
||||
pht = (UINT32*)malloc(phtArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
// 将模式历史表(pht)全部初始化为0
|
||||
for(UINT64 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
for(UINT32 i = 0; i < phtArraySize; i++)
|
||||
{
|
||||
pht[i] = 0;
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低10位,去索引模式历史表pht,得到对应的3位历史信息
|
||||
UINT32 index1 = (PC>>2) & 0x3ff;
|
||||
// 将PC的低10位,与3位历史信息进行拼接,形成一个13位的状态数组索引(拼接需要使用C语言的移位、与、或等运算)
|
||||
//UINT64 index2 = (index1<<3) | (pht[index1]);
|
||||
//UINT64 index2 = (index1) | (pht[index1]<<10);
|
||||
UINT64 index2 = (index1 & 0x3ff) | (pht[index1]<<10 & 0x1c00);
|
||||
// 用13位去索引状态数组,得到对应的饱和状态
|
||||
if(State[index2] == 0 || State[index2] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
|
||||
//return TAKEN;
|
||||
// return NOT_TAKEN;
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
// 更新pht中的最近3次分支历史信息,使用移位寄存器来更新
|
||||
// 将其更新到pht中
|
||||
UINT32 index1 = (PC>>2) & 0x3ff;
|
||||
//UINT64 index2 = (index1<<3) | (pht[index1]);
|
||||
//UINT64 index2 = (index1) | (pht[index1]<<10);
|
||||
UINT64 index2 = (index1 & 0x3ff) | (pht[index1]<<10 & 0x1c00);
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index2] = SatIncrement(State[index2], 3);
|
||||
pht[index1] = (pht[index1]<<1) | 0x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index2] = SatDecrement(State[index2]);
|
||||
pht[index1] = (pht[index1]<<1);
|
||||
}
|
||||
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
free(pht);
|
||||
}
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x < max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x > 0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define BITS_OF_PC 9 // 选择9位的PC作为索引
|
||||
#define LOCAL_HIST_LEN 4 // 局部历史长度,4位
|
||||
#define LOCAL_HIST_MASK ~(~0 << LOCAL_HIST_LEN)
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32* pht; // pattern history table 模式历史表
|
||||
UINT32 phtArraySize; // pht数组项数
|
||||
UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << (BITS_OF_PC + LOCAL_HIST_LEN)); // 状态数组项数
|
||||
|
||||
State = (UINT32*)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
phtArraySize = (1 << BITS_OF_PC); // pht数组项数
|
||||
|
||||
pht = (UINT32*)malloc(phtArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
// 将模式历史表(pht)全部初始化为0
|
||||
for(UINT64 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
for(UINT32 i = 0; i < phtArraySize; i++)
|
||||
{
|
||||
pht[i] = 0;
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低10位,去索引模式历史表pht,得到对应的3位历史信息
|
||||
// 将PC的低10位,与3位历史信息进行拼接,形成一个13位的状态数组索引(拼接需要使用C语言的移位、与、或等运算)
|
||||
// 用13位去索引状态数组,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
UINT32 index1 = (PC>>2) & 0x1ff;
|
||||
UINT64 index2 = (index1 & 0x1ff) | (pht[index1]<<9 & 0x1e00);
|
||||
if(State[index2] == 0 || State[index2] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
//return TAKEN;
|
||||
// return NOT_TAKEN;
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
|
||||
}
|
||||
|
||||
// 2位状态的分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
// 更新pht中的最近3次分支历史信息,使用移位寄存器来更新
|
||||
// 将其更新到pht中
|
||||
UINT32 index1 = (PC>>2) & 0x1ff;
|
||||
UINT64 index2 = (index1 & 0x1ff) | (pht[index1]<<9 & 0x1e00);
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index2] = SatIncrement(State[index2], 3);
|
||||
pht[index1] = (pht[index1]<<1) | 0x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index2] = SatDecrement(State[index2]);
|
||||
pht[index1] = (pht[index1]<<1);
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
free(pht);
|
||||
}
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x < max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x > 0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define GLOBAL_HIST_LEN 13 // 全局历史长度,13位
|
||||
#define GLOBAL_HIST_MASK ~(~0 << GLOBAL_HIST_LEN)
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32 GHR; // Global History Register,全局历史寄存器
|
||||
UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << GLOBAL_HIST_LEN); // 状态数组项数
|
||||
|
||||
State = (UINT32*)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
// 将全局历史寄存器(GHR)初始化为0
|
||||
for(UINT64 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
GHR = 0;
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// Gshare分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 用13位的GHR去索引状态数组,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
UINT64 index = GHR & 0x1fff;
|
||||
if(State[index] == 0 || State[index] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
// return TAKEN;
|
||||
// return NOT_TAKEN;
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
|
||||
}
|
||||
|
||||
// Gshare分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
// 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新
|
||||
UINT64 index = GHR & 0x1fff;
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index] = SatIncrement(State[index], 3);
|
||||
GHR = GHR << 1 | 0x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index] = SatDecrement(State[index]);
|
||||
GHR = GHR << 1;
|
||||
}
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x < max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x > 0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define BITS_OF_PC 3 // 选择3位的PC作为索引
|
||||
#define GLOBAL_HIST_LEN 10 // 全局历史长度,10位
|
||||
#define STATE_INDEX_MASK ~(~0 << (BITS_OF_PC + GLOBAL_HIST_LEN))
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32 GHR; // Global History Register,全局历史寄存器
|
||||
UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << (BITS_OF_PC +GLOBAL_HIST_LEN)); // 状态数组项数
|
||||
|
||||
State = (UINT32*)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
// 将全局历史寄存器(GHR)初始化为0
|
||||
for(UINT64 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
GHR = 0;
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// Gshare分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低3位,与10位GHR进行拼接,形成一个13位的状态数组索引
|
||||
// 用13位去索引状态数组,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转
|
||||
UINT64 index = (((PC>>2 & 0x7) << 10) & 0x1c00) | (GHR & 0x3ff);
|
||||
if(State[index] == 0 || State[index] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
// return TAKEN;
|
||||
// return NOT_TAKEN;
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
|
||||
}
|
||||
|
||||
// Gshare分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
// 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新
|
||||
UINT64 index = (((PC>>2 & 0x7) << 10) & 0x1c00) | (GHR & 0x3ff);
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index] = SatIncrement(State[index], 3);
|
||||
GHR = GHR << 1 | 0x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index] = SatDecrement(State[index]);
|
||||
GHR = GHR << 1;
|
||||
}
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// 饱和计数器:加1
|
||||
static inline UINT32 SatIncrement(UINT32 x, UINT32 max)
|
||||
{
|
||||
if (x < max) return x + 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
// 饱和计数器:减1
|
||||
static inline UINT32 SatDecrement(UINT32 x)
|
||||
{
|
||||
if (x > 0) return x - 1;
|
||||
return x;
|
||||
}
|
||||
|
||||
#define GLOBAL_HIST_LEN 13 // 全局历史长度,13位
|
||||
#define GLOBAL_HIST_MASK ~(~0 << GLOBAL_HIST_LEN)
|
||||
|
||||
#define STATE_MAX 3
|
||||
#define STATE_INIT 2
|
||||
|
||||
UINT32 GHR; // Global History Register,全局历史寄存器
|
||||
UINT32* State; // 状态数组,用于保存分支指令的状态机,实际只使用最低2位
|
||||
UINT64 StateArraySize;
|
||||
|
||||
void PREDICTOR_init(void)
|
||||
{
|
||||
StateArraySize = (1 << GLOBAL_HIST_LEN); // 状态数组项数
|
||||
|
||||
State = (UINT32*)malloc(StateArraySize * sizeof(UINT32));
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将状态数组,全部初始化为STATE_INIT
|
||||
// 将全局历史寄存器(GHR)初始化为0
|
||||
for(UINT64 i = 0; i < StateArraySize; i++)
|
||||
{
|
||||
State[i] = STATE_INIT;
|
||||
}
|
||||
GHR = 0;
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
// Gshare分支预测器(预测部分)
|
||||
char GetPrediction(UINT64 PC)
|
||||
{
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 将PC的低13位,与13位GHR进行异或,形成一个13位的状态数组索引
|
||||
// 用13位去索引状态数组,得到对应的饱和状态
|
||||
// 如果该状态的值超过一半,则预测跳转
|
||||
// 如果该状态的值低于一半,则预测不跳转7]
|
||||
UINT64 nPC = (PC) & 0x1fff;
|
||||
UINT64 index = nPC ^ (GHR & 0x1fff);
|
||||
if(State[index] == 0 || State[index] == 1) return NOT_TAKEN;
|
||||
return TAKEN;
|
||||
// return TAKEN;
|
||||
// return NOT_TAKEN;
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
|
||||
}
|
||||
|
||||
// Gshare分支预测器(更新部分)
|
||||
void UpdatePredictor(UINT64 PC, OpType opType, char resolveDir, char predDir, UINT64 branchTarget)
|
||||
{
|
||||
|
||||
// *********** 你需要在下面书写代码 ***********
|
||||
// 根据分支指令实际执行结果,来更新对应的饱和计数器
|
||||
// 如果结果为跳转,则对应的饱和计数器+1
|
||||
// 如果结果为不跳转,则对应的饱和计数器-1
|
||||
// 更新GHR中的最近1次分支历史信息,使用移位寄存器来更新
|
||||
UINT64 nPC = (PC) & 0x1fff;
|
||||
UINT64 index = nPC ^ (GHR & 0x1fff);
|
||||
if(resolveDir == 'T')
|
||||
{
|
||||
State[index] = SatIncrement(State[index], 3);
|
||||
GHR = GHR << 1 | 0x1;
|
||||
}
|
||||
else
|
||||
{
|
||||
State[index] = SatDecrement(State[index]);
|
||||
GHR = GHR << 1;
|
||||
}
|
||||
|
||||
// *********** 你需要在上面书写代码 ***********
|
||||
}
|
||||
|
||||
void PREDICTOR_free(void)
|
||||
{
|
||||
free(State);
|
||||
}
|
||||
|
||||
|
||||
@ -1,34 +0,0 @@
|
||||
CC = gcc
|
||||
CFLAGS = -Wall -O1 -g
|
||||
#LDFLAGS = -lm -lcudart -lcuda
|
||||
|
||||
# Source files
|
||||
SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c
|
||||
#CUDA_SRCS = rowcol.cu
|
||||
OBJS = $(SRCS:.c=.o)
|
||||
#rowcol.o
|
||||
|
||||
# Target executable
|
||||
TARGET = matrix_test
|
||||
|
||||
# Default target
|
||||
all: $(TARGET)
|
||||
|
||||
# Rule to build the executable
|
||||
$(TARGET): $(OBJS)
|
||||
$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
|
||||
|
||||
# Rule to build object files
|
||||
%.o: %.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
# Rule to build CUDA object files
|
||||
#rowcol.o: rowcol.cu
|
||||
# $(NVCC) $(CUDA_FLAGS) -c $< -o $@
|
||||
|
||||
# Clean rule
|
||||
clean:
|
||||
rm -f $(OBJS) $(TARGET)
|
||||
|
||||
# Phony targets
|
||||
.PHONY: all clean
|
||||
BIN
perflab/matrix/a.exe
Executable file
BIN
perflab/matrix/a.exe
Executable file
Binary file not shown.
@ -1,196 +1,229 @@
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
* Modified for cross-platform compatibility
|
||||
*/
|
||||
|
||||
#define _GNU_SOURCE // For sched_setaffinity on Linux
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <intrin.h>
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <sched.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
typedef struct {
|
||||
uint64_t QuadPart;
|
||||
} LARGE_INTEGER;
|
||||
typedef void *HANDLE;
|
||||
#define __int64 long long
|
||||
#define Sleep(ms) usleep((ms) * 1000)
|
||||
#endif
|
||||
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/* Reads the time-stamp counter via __rdtsc() and stores its upper 32 bits
 * in *hi and the value truncated to unsigned in *lo. */
void access_counter(unsigned *hi, unsigned *lo)
{
    const uint64_t tsc = __rdtsc();

    *hi = (unsigned)(tsc >> 32);
    *lo = (unsigned)tsc;
}
|
||||
|
||||
/* Records the current cycle counter in cyc_hi/cyc_lo as the baseline that
 * get_counter() measures from. */
void start_counter()
{
    access_counter(&cyc_hi, &cyc_lo);
}
|
||||
|
||||
double get_counter() {
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
uint64_t start = ((uint64_t)cyc_hi << 32) | cyc_lo;
|
||||
uint64_t end = ((uint64_t)ncyc_hi << 32) | ncyc_lo;
|
||||
return (double)(end - start);
|
||||
}
|
||||
|
||||
/* Spins until get_counter() has advanced by at least 1e9 cycles from its
 * value on entry. */
void make_CPU_busy(void)
{
    volatile double old_tick = get_counter();
    /* Must start from a defined value: the loop condition reads new_tick
     * before the first assignment inside the loop body. */
    volatile double new_tick = old_tick;

    while ((new_tick - old_tick) < 1000000000) {
        new_tick = get_counter();
    }
}
|
||||
|
||||
#ifdef _WIN32
|
||||
#define GET_TIME(dest) QueryPerformanceCounter(dest)
|
||||
#else
|
||||
/* Non-Windows stand-in for QueryPerformanceCounter: stores the
 * CLOCK_MONOTONIC time, expressed in nanoseconds, in dest->QuadPart. */
static inline void GET_TIME(LARGE_INTEGER *dest)
{
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t whole_seconds_ns = (uint64_t)now.tv_sec * 1000000000;
    dest->QuadPart = whole_seconds_ns + now.tv_nsec;
}
|
||||
#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000)
|
||||
#endif
|
||||
|
||||
double mhz(int verbose) {
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
GET_TIME(&lPerformanceCount_Start);
|
||||
_i64StartCpuCounter = __rdtsc();
|
||||
Sleep(200);
|
||||
GET_TIME(&lPerformanceCount_End);
|
||||
_i64EndCpuCounter = __rdtsc();
|
||||
|
||||
fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) /
|
||||
(double)lFrequency.QuadPart;
|
||||
mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
|
||||
|
||||
if (verbose > 0) {
|
||||
printf("CPU频率为: %.6fMHz.\n", mhz);
|
||||
}
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void) {
|
||||
double result;
|
||||
int i, j, k;
|
||||
LARGE_INTEGER lStart, lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
double fTime;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
GET_TIME(&lStart);
|
||||
start_counter();
|
||||
|
||||
for (i = 0; i < 100; i++)
|
||||
for (j = 0; j < 1000; j++)
|
||||
for (k = 0; k < 1000; k++)
|
||||
;
|
||||
|
||||
result = get_counter();
|
||||
GET_TIME(&lEnd);
|
||||
|
||||
fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart;
|
||||
printf("CPU计算时长为: %f", result);
|
||||
printf("\t %f\n", fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void) {
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000; // nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start, stop;
|
||||
int i;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
#ifdef _WIN32
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
#else
|
||||
cpu_set_t cpuset;
|
||||
CPU_ZERO(&cpuset);
|
||||
CPU_SET(0, &cpuset);
|
||||
sched_setaffinity(0, sizeof(cpuset), &cpuset);
|
||||
#endif
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
GET_TIME(&start);
|
||||
|
||||
for (i = 0; i < iterations; i++) {
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
|
||||
GET_TIME(&stop);
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)(stop.QuadPart - start.QuadPart);
|
||||
time = (ticks * multiplier) / frequency;
|
||||
loops_per_sec = iterations / (time / multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
ratio = instructions_per_loop / known_instructions_per_loop;
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n", time);
|
||||
return ratio;
|
||||
}
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
*
|
||||
*/
|
||||
|
||||
/* When this constant is not defined, uses time stamp counter */
|
||||
#define USE_POSIX 0
|
||||
|
||||
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <intrin.h>
|
||||
//#include <intrinsics.h>
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
|
||||
/* Initialize the cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/* Set *hi and *lo to the high and low order 32 bits of the cycle counter,
   read with the __rdtsc() intrinsic. */
void access_counter(unsigned *hi, unsigned *lo)
{
    long long tsc = __rdtsc();

    *hi = (unsigned int)(tsc >> 32);
    *lo = (unsigned int)tsc;
}
|
||||
|
||||
|
||||
/* Record the current value of the cycle counter. */
|
||||
void start_counter()
|
||||
{
|
||||
access_counter(&cyc_hi, &cyc_lo);
|
||||
}
|
||||
|
||||
/* Return the number of cycles since the last call to start_counter. */
|
||||
double get_counter()
|
||||
{
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double) hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
}
|
||||
/* Resets the cycle baseline, then spins until get_counter() has advanced
   by at least 1e9 cycles from the first reading. */
void make_CPU_busy(void)
{
    volatile double first_tick, latest_tick;

    start_counter();
    first_tick = get_counter();

    do
    {
        latest_tick = get_counter();
    } while (latest_tick - first_tick < 1000000000);
}
|
||||
|
||||
//CPU<50><55>Ƶ<EFBFBD><C6B5>
|
||||
double mhz(int verbose)
|
||||
{
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
//On a multiprocessor machine, it should not matter which processor is called.
|
||||
//However, you can get different results on different processors due to bugs in
|
||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||
HANDLE hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD>ϸ߾<CFB8><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD><C6B5>
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>Ӧ<EFBFBD>þ<EFBFBD><C3BE><EFBFBD>һƬ8253<35><33><EFBFBD><EFBFBD>8254
|
||||
//<2F><>intel ich7<68>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("<22>߾<EFBFBD><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>ÿ<EFBFBD><C3BF><EFBFBD><EFBFBD>һ<EFBFBD><D2BB>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><DAA3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
|
||||
//RDTSCָ<43><D6B8>:<3A><>ȡCPU<50><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
_i64StartCpuCounter=__rdtsc();
|
||||
|
||||
//<2F><>ʱ<EFBFBD><CAB1>һ<EFBFBD><D2BB>,<2C><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Сһ<D0A1><D2BB>
|
||||
//int nTemp=100000;
|
||||
//while (--nTemp);
|
||||
Sleep(200);
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
|
||||
_i64EndCpuCounter=__rdtsc();
|
||||
|
||||
//f=1/T => f=<3D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/(<28><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T)
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD>ġ<EFBFBD><C4A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T<><54><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD>
|
||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||
/(double)lFrequency.QuadPart;
|
||||
|
||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||
if (verbose>0)
|
||||
printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n",mhz);
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void)
|
||||
{
|
||||
double result;
|
||||
int i,j,k,ii,jj,kk;
|
||||
LARGE_INTEGER lStart,lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
HANDLE hThread;
|
||||
double fTime;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&lStart);
|
||||
//_asm("cpuid");
|
||||
start_counter();
|
||||
for (i=0;i<100;i++)
|
||||
for (j=0;j<1000;j++)
|
||||
for (k=0;k<1000;k++)
|
||||
kk += kk*ii+jj;
|
||||
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||
printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f",result);
|
||||
printf("\t %f\n",fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void)
|
||||
{
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000;//nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start,stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz= 1000*1000*1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for( i = 0; i < iterations; i++)
|
||||
{
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
|
||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) /frequency;
|
||||
loops_per_sec = iterations / (time/multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n",time);
|
||||
return ratio;
|
||||
}
|
||||
|
||||
@ -1,229 +0,0 @@
|
||||
/* clock.c
|
||||
* Retrofitted to use thread-specific timers
|
||||
* and to get clock information from /proc/cpuinfo
|
||||
* (C) R. E. Bryant, 2010
|
||||
*
|
||||
*/
|
||||
|
||||
/* When this constant is not defined, uses time stamp counter */
|
||||
#define USE_POSIX 0
|
||||
|
||||
/* Choice to use cpu_gettime call or Intel time stamp counter directly */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
//#include <intrinsics.h>
|
||||
//#include <windows.h>
|
||||
#include <time.h>
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
|
||||
/* Initialize the cycle counter */
|
||||
static unsigned cyc_hi = 0;
|
||||
static unsigned cyc_lo = 0;
|
||||
|
||||
/*
 * Read the time stamp counter via __rdtsc() and split it into two 32-bit
 * halves: *hi receives bits 63..32 and *lo receives bits 31..0.
 */
void access_counter(unsigned *hi, unsigned *lo)
{
    /* Keep the value unsigned: __rdtsc() yields an unsigned 64-bit count, and
       right-shifting a negative signed value is implementation-defined. */
    unsigned long long counter = __rdtsc();

    *hi = (unsigned int)(counter >> 32);
    *lo = (unsigned int)counter;
}
|
||||
|
||||
|
||||
/* Record the current value of the cycle counter. */
|
||||
void start_counter()
|
||||
{
|
||||
access_counter(&cyc_hi, &cyc_lo);
|
||||
}
|
||||
|
||||
/* Return the number of cycles since the last call to start_counter. */
|
||||
double get_counter()
|
||||
{
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double) hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
}
|
||||
/*
 * Spin until get_counter() has advanced by at least BUSY_CYCLES relative to
 * the first reading taken after start_counter().
 */
void make_CPU_busy(void)
{
    enum { BUSY_CYCLES = 1000000000 }; /* counter ticks to burn */
    volatile double old_tick, new_tick;

    start_counter();
    old_tick = get_counter();

    /* Always take at least one further reading, then poll until done. */
    do {
        new_tick = get_counter();
    } while (new_tick - old_tick < BUSY_CYCLES);
}
|
||||
|
||||
//CPU<50><55>Ƶ<EFBFBD><C6B5>
|
||||
double mhz(int verbose)
|
||||
{
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
//On a multiprocessor machine, it should not matter which processor is called.
|
||||
//However, you can get different results on different processors due to bugs in
|
||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||
HANDLE hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD>ϸ߾<CFB8><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD><C6B5>
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>Ӧ<EFBFBD>þ<EFBFBD><C3BE><EFBFBD>һƬ8253<35><33><EFBFBD><EFBFBD>8254
|
||||
//<2F><>intel ich7<68>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("<22>߾<EFBFBD><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>ÿ<EFBFBD><C3BF><EFBFBD><EFBFBD>һ<EFBFBD><D2BB>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><DAA3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
|
||||
//RDTSCָ<43><D6B8>:<3A><>ȡCPU<50><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
_i64StartCpuCounter=__rdtsc();
|
||||
|
||||
//<2F><>ʱ<EFBFBD><CAB1>һ<EFBFBD><D2BB>,<2C><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Сһ<D0A1><D2BB>
|
||||
//int nTemp=100000;
|
||||
//while (--nTemp);
|
||||
Sleep(200);
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
|
||||
_i64EndCpuCounter=__rdtsc();
|
||||
|
||||
//f=1/T => f=<3D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/(<28><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T)
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD>ġ<EFBFBD><C4A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T<><54><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD>
|
||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||
/(double)lFrequency.QuadPart;
|
||||
|
||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||
if (verbose>0)
|
||||
printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n",mhz);
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void)
|
||||
{
|
||||
double result;
|
||||
int i,j,k,ii,jj,kk;
|
||||
LARGE_INTEGER lStart,lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
HANDLE hThread;
|
||||
double fTime;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&lStart);
|
||||
//_asm("cpuid");
|
||||
start_counter();
|
||||
for (i=0;i<100;i++)
|
||||
for (j=0;j<1000;j++)
|
||||
for (k=0;k<1000;k++)
|
||||
kk += kk*ii+jj;
|
||||
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||
printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f",result);
|
||||
printf("\t %f\n",fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void)
|
||||
{
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000;//nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start,stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz= 1000*1000*1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for( i = 0; i < iterations; i++)
|
||||
{
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
|
||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) /frequency;
|
||||
loops_per_sec = iterations / (time/multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n",time);
|
||||
return ratio;
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
|
||||
if (clear_cache)
|
||||
clear();
|
||||
start_counter();
|
||||
f((long*)params);
|
||||
f((long int*)params);
|
||||
cyc = get_counter();
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
|
||||
clear();
|
||||
start_counter();
|
||||
for (i=0;i<MAX_ITER_TIMES;i++)
|
||||
f((long*)params);
|
||||
f((long int *)params);
|
||||
cyc = get_counter()/MAX_ITER_TIMES;
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,69 +0,0 @@
|
||||
/**************************************************************************
    Row/column sum functions. Edit this file as follows:
    1. Write your student ID and name below, as a comment;
    2. Implement different versions of the row/column sum functions;
    3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
       (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||
|
||||
/*
|
||||
????201209054233
|
||||
??????????????
|
||||
*/
|
||||
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* ????????????????? */
|
||||
/* ???????????????????????????????????????????????
|
||||
??????????2?????????????????
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
/* ???????????????????? */
|
||||
/* ??????????????????????? */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
|
||||
int i, j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
????????????????????????????????????????, COL/ROWCOL, "?????????"??
|
||||
COL??????????????????????
|
||||
ROWCOL???????????????????????
|
||||
?????????????????????????????
|
||||
????
|
||||
{my_c_sum1, "?????????????????"},
|
||||
{my_rc_sum2, "??????????????????"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] = {
|
||||
|
||||
/* ???????????????????????????????? */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* ?????????????????????????????????? */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* ??????????????????????????????????????? */
|
||||
{NULL, ROWCOL, NULL}};
|
||||
@ -1,162 +0,0 @@
|
||||
/**************************************************************************
    Row/column sum functions. Edit this file as follows:
    1. Write your student ID and name below, as a comment;
    2. Implement different versions of the row/column sum functions;
    3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
       (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||
|
||||
/*
|
||||
ѧ<>ţ<EFBFBD>202302723005
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̾<EFBFBD><CCBE><EFBFBD>
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
/* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA><CDA1><EFBFBD>ע<EFBFBD><D7A2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˵<EFBFBD><CBB5><EFBFBD><EFBFBD><EFBFBD>ò<EFBFBD><C3B2><EFBFBD><EFBFBD><EFBFBD>
|
||||
һ<><D2BB><EFBFBD>ģ<EFBFBD>ֻ<EFBFBD>ǵ<EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA> */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* The kernel is defined further down in this file; declare it before use. */
__global__ void cudaColumnSum(int *M, int *colsum);

/*
 * Column sums computed on the GPU.
 *
 * Copies M to the device, launches cudaColumnSum with 256 threads per block
 * and enough blocks to cover N columns, then copies the N results back into
 * colsum. rowsum is not used. On any CUDA error a message is written to
 * stderr, device memory is released, and colsum may be left unwritten.
 *
 * NOTE(review): the copies assume M is N*N contiguous int elements and that
 * colsum holds N ints - confirm against the definitions in rowcol.h.
 */
void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    /* Sizes are computed in size_t so N * N cannot overflow int. */
    const size_t matrix_bytes = (size_t)N * N * sizeof(int);
    const size_t vector_bytes = (size_t)N * sizeof(int);
    const dim3 block(256);
    const dim3 grid((N + block.x - 1) / block.x); /* ceil(N / 256) blocks */
    int *d_M = NULL;
    int *d_colsum = NULL;
    cudaError_t err;

    (void)rowsum;

    /* Allocate device buffers. */
    err = cudaMalloc((void **)&d_M, matrix_bytes);
    if (err != cudaSuccess) goto done;
    err = cudaMalloc((void **)&d_colsum, vector_bytes);
    if (err != cudaSuccess) goto done;

    /* Host -> device copy of the matrix. */
    err = cudaMemcpy(d_M, M, matrix_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) goto done;

    cudaColumnSum<<<grid, block>>>(d_M, d_colsum);
    err = cudaGetLastError(); /* catches launch-configuration failures */
    if (err != cudaSuccess) goto done;

    /* Device -> host copy of the result. */
    err = cudaMemcpy(colsum, d_colsum, vector_bytes, cudaMemcpyDeviceToHost);

done:
    if (err != cudaSuccess)
        fprintf(stderr, "cuda_c_sum: %s\n", cudaGetErrorString(err));
    /* cudaFree accepts a null pointer, so no guards are needed. */
    cudaFree(d_M);
    cudaFree(d_colsum);
}
|
||||
|
||||
/* The kernel is defined further down in this file; declare it before use. */
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum);

/*
 * Row and column sums computed on the GPU.
 *
 * Copies M to the device, launches cudaRowColSum with 256 threads per block
 * and enough blocks to cover N indices, then copies the N row sums and N
 * column sums back into rowsum and colsum. On any CUDA error a message is
 * written to stderr, device memory is released, and the outputs may be left
 * unwritten.
 *
 * NOTE(review): the copies assume M is N*N contiguous int elements and that
 * rowsum/colsum each hold N ints - confirm against rowcol.h.
 */
void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    /* Sizes are computed in size_t so N * N cannot overflow int. */
    const size_t matrix_bytes = (size_t)N * N * sizeof(int);
    const size_t vector_bytes = (size_t)N * sizeof(int);
    const dim3 block(256);
    const dim3 grid((N + block.x - 1) / block.x); /* ceil(N / 256) blocks */
    int *d_M = NULL;
    int *d_rowsum = NULL;
    int *d_colsum = NULL;
    cudaError_t err;

    /* Allocate device buffers. */
    err = cudaMalloc((void **)&d_M, matrix_bytes);
    if (err != cudaSuccess) goto done;
    err = cudaMalloc((void **)&d_rowsum, vector_bytes);
    if (err != cudaSuccess) goto done;
    err = cudaMalloc((void **)&d_colsum, vector_bytes);
    if (err != cudaSuccess) goto done;

    /* Host -> device copy of the matrix. */
    err = cudaMemcpy(d_M, M, matrix_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) goto done;

    cudaRowColSum<<<grid, block>>>(d_M, d_rowsum, d_colsum);
    err = cudaGetLastError(); /* catches launch-configuration failures */
    if (err != cudaSuccess) goto done;

    /* Device -> host copies of both result vectors. */
    err = cudaMemcpy(rowsum, d_rowsum, vector_bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) goto done;
    err = cudaMemcpy(colsum, d_colsum, vector_bytes, cudaMemcpyDeviceToHost);

done:
    if (err != cudaSuccess)
        fprintf(stderr, "cuda_rc_sum: %s\n", cudaGetErrorString(err));
    /* cudaFree accepts a null pointer, so no guards are needed. */
    cudaFree(d_M);
    cudaFree(d_rowsum);
    cudaFree(d_colsum);
}
|
||||
|
||||
/*
 * CUDA kernel: column sums.
 *
 * Each thread with global index col < N adds up M[row * N + col] for
 * row in [0, N) and stores the total in colsum[col]. Threads whose index is
 * N or larger do nothing.
 */
__global__ void cudaColumnSum(int *M, int *colsum)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (col >= N)
        return;

    /* Accumulate locally and store once, instead of a read-modify-write of
       colsum[col] on every iteration. */
    int total = 0;
    for (int row = 0; row < N; row++) {
        total += M[row * N + col];
    }
    colsum[col] = total;
}
|
||||
|
||||
/*
 * CUDA kernel: row and column sums.
 *
 * Each thread with global index idx < N computes the sum of row idx
 * (elements M[idx * N + k]) and the sum of column idx (elements
 * M[k * N + idx]) and stores them in rowsum[idx] and colsum[idx]. Threads
 * whose index is N or larger do nothing.
 */
__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx >= N)
        return;

    /* Accumulate both totals locally in a single pass and store each once,
       instead of updating the output arrays on every iteration. */
    int row_total = 0;
    int col_total = 0;
    for (int k = 0; k < N; k++) {
        row_total += M[idx * N + k];
        col_total += M[k * N + idx];
    }
    rowsum[idx] = row_total;
    colsum[idx] = col_total;
}
|
||||
|
||||
/*
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ԫ<EFBFBD>أ<EFBFBD>ÿһ<C3BF><D2BB>Ԫ<EFBFBD>أ<EFBFBD><D8A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD>"<22><>
|
||||
COL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
ROWCOL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD>֣<EFBFBD><D6A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD>档
|
||||
<09><><EFBFBD>磺
|
||||
{my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
{my_rc_sum2, "<22><>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* <20><>һ<EFBFBD>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
{cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
/* <20>ڶ<EFBFBD><DAB6>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
{cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD>Ĵ<EFBFBD><C4B4>벻<EFBFBD><EBB2BB><EFBFBD>Ļ<DEB8><C4BB><EFBFBD>ɾ<EFBFBD><C9BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>б<EFBFBD><D0B1><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
Binary file not shown.
@ -1,240 +0,0 @@
|
||||
/**************************************************************************
    Row/column sum functions. Edit this file as follows:
    1. Write your student ID and name below, as a comment;
    2. Implement different versions of the row/column sum functions;
    3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
       (best row-and-column sum, best column sum) are its first two entries.
***************************************************************************/
|
||||
|
||||
/*
|
||||
<20><><EFBFBD><EFBFBD>201209054233
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><F2BFBFBF>
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"<22><>
|
||||
COL<4F><4C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
ROWCOL<4F><4C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<20><><EFBFBD><EFBFBD>
|
||||
{my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"},
|
||||
{my_rc_sum2, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
|
||||
// /**************************************************************************
|
||||
// <09><>/<2F><><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><D2AA><EFBFBD>༭<EFBFBD><E0BCAD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
||||
// 1. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ѧ<EFBFBD>š<EFBFBD><C5A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ע<EFBFBD>͵ķ<CDB5>ʽд<CABD><D0B4><EFBFBD><EFBFBD><EFBFBD>棻
|
||||
// 2. ʵ<>ֲ<EFBFBD>ͬ<EFBFBD>汾<EFBFBD><E6B1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD>
|
||||
// 3. <20>༭rc_fun_rec rc_fun_tab<61><62><EFBFBD>飬<EFBFBD><E9A3AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õĴ<C3B5><C4B4><EFBFBD>
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͡<EFBFBD><CDA1><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><CDA3><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0><EFBFBD><EFBFBD>
|
||||
// ***************************************************************************/
|
||||
//
|
||||
// /*
|
||||
// ѧ<>ţ<EFBFBD>202302723005
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̾<EFBFBD><CCBE><EFBFBD>
|
||||
// */
|
||||
//
|
||||
//
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "rowcol.h"
|
||||
// #include <math.h>
|
||||
// #include <cuda_runtime.h>
|
||||
//
|
||||
// /* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA><CDA1><EFBFBD>ע<EFBFBD><D7A2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˵<EFBFBD><CBB5><EFBFBD><EFBFBD><EFBFBD>ò<EFBFBD><C3B2><EFBFBD><EFBFBD><EFBFBD>
|
||||
// һ<><D2BB><EFBFBD>ģ<EFBFBD>ֻ<EFBFBD>ǵ<EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>
|
||||
// */
|
||||
//
|
||||
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// colsum[j] = 0;
|
||||
// for (i = 0; i < N; i++)
|
||||
// colsum[j] += M[i][j];
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//
|
||||
// /* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA> */
|
||||
//
|
||||
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (i = 0; i < N; i++) {
|
||||
// rowsum[i] = colsum[i] = 0;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// rowsum[i] += M[i][j];
|
||||
// colsum[i] += M[j][i];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>Ż<EFBFBD><C5BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD> */
|
||||
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// int *d_M, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>ݴ<EFBFBD><DDB4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><C6B5>豸
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD>CUDA<44>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD><E8B1B8><EFBFBD>ƻ<EFBFBD><C6BB><EFBFBD><EFBFBD><EFBFBD>
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // <20>ͷ<EFBFBD><CDB7>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>Ż<EFBFBD><C5BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD> */
|
||||
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// int *d_M, *d_rowsum, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>ݴ<EFBFBD><DDB4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><C6B5>豸
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD>CUDA<44>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD><E8B1B8><EFBFBD>ƻ<EFBFBD><C6BB><EFBFBD><EFBFBD><EFBFBD>
|
||||
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // <20>ͷ<EFBFBD><CDB7>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_rowsum);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>˺<EFBFBD><CBBA><EFBFBD> - <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||
// {
|
||||
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (col < N) {
|
||||
// colsum[col] = 0;
|
||||
// for (int row = 0; row < N; row++) {
|
||||
// colsum[col] += M[row * N + col];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>˺<EFBFBD><CBBA><EFBFBD> - <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||
// {
|
||||
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (idx < N) {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD>
|
||||
// rowsum[idx] = 0;
|
||||
// for (int j = 0; j < N; j++) {
|
||||
// rowsum[idx] += M[idx * N + j];
|
||||
// }
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD>
|
||||
// colsum[idx] = 0;
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// colsum[idx] += M[i * N + idx];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /*
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ԫ<EFBFBD>أ<EFBFBD>ÿһ<C3BF><D2BB>Ԫ<EFBFBD>أ<EFBFBD><D8A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD>"<22><>
|
||||
// COL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
// ROWCOL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD>֣<EFBFBD><D6A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD>档
|
||||
// <09><><EFBFBD>磺
|
||||
// {my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
// {my_rc_sum2, "<22><>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
// */
|
||||
//
|
||||
// rc_fun_rec rc_fun_tab[] =
|
||||
// {
|
||||
//
|
||||
// /* <20><>һ<EFBFBD>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
// /* <20>ڶ<EFBFBD><DAB6>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
//
|
||||
// {c_sum, COL, "Column sum, reference implementation"},
|
||||
//
|
||||
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
//
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD>Ĵ<EFBFBD><C4B4>벻<EFBFBD><EBB2BB><EFBFBD>Ļ<DEB8><C4BB><EFBFBD>ɾ<EFBFBD><C9BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>б<EFBFBD><D0B1><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// {NULL,ROWCOL,NULL}
|
||||
// };
|
||||
@ -1,240 +0,0 @@
|
||||
/**************************************************************************
|
||||
<20><>/<2F><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
1. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
2. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
3. <20><>rc_fun_rec rc_fun_tab<61><62><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
<20><><EFBFBD><EFBFBD>201209054233
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><F2BFBFBF>
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
|
||||
/*
 * Reference column sum.
 * For every column index c in [0, N), stores the sum of M[0..N-1][c] in
 * colsum[c].  rowsum is accepted only so the signature matches rc_sum; it is
 * never read or written here.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int col = 0;

    while (col < N) {
        int row;

        colsum[col] = 0;
        for (row = 0; row < N; row++) {
            colsum[col] += M[row][col];
        }
        col++;
    }
}
|
||||
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
|
||||
/*
 * Reference row-and-column sum.
 * For every index k in [0, N): rowsum[k] becomes the sum of row k of M and
 * colsum[k] becomes the sum of column k of M.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
{
    int k;

    for (k = 0; k < N; k++) {
        int t = 0;

        /* Same right-to-left chained store as the reference code. */
        rowsum[k] = colsum[k] = 0;
        while (t < N) {
            rowsum[k] += M[k][t];   /* walk along row k */
            colsum[k] += M[t][k];   /* walk down column k */
            t++;
        }
    }
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"<22><>
|
||||
COL<4F><4C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
ROWCOL<4F><4C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
<20><><EFBFBD><EFBFBD>
|
||||
{my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"},
|
||||
{my_rc_sum2, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
|
||||
// /**************************************************************************
|
||||
// <09><>/<2F><><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><D2AA><EFBFBD>༭<EFBFBD><E0BCAD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
||||
// 1. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ѧ<EFBFBD>š<EFBFBD><C5A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ע<EFBFBD>͵ķ<CDB5>ʽд<CABD><D0B4><EFBFBD><EFBFBD><EFBFBD>棻
|
||||
// 2. ʵ<>ֲ<EFBFBD>ͬ<EFBFBD>汾<EFBFBD><E6B1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD>
|
||||
// 3. <20>༭rc_fun_rec rc_fun_tab<61><62><EFBFBD>飬<EFBFBD><E9A3AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õĴ<C3B5><C4B4><EFBFBD>
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͡<EFBFBD><CDA1><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><CDA3><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0><EFBFBD><EFBFBD>
|
||||
// ***************************************************************************/
|
||||
//
|
||||
// /*
|
||||
// ѧ<>ţ<EFBFBD>202302723005
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̾<EFBFBD><CCBE><EFBFBD>
|
||||
// */
|
||||
//
|
||||
//
|
||||
// #include <stdio.h>
|
||||
// #include <stdlib.h>
|
||||
// #include "rowcol.h"
|
||||
// #include <math.h>
|
||||
// #include <cuda_runtime.h>
|
||||
//
|
||||
// /* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA><CDA1><EFBFBD>ע<EFBFBD><D7A2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˵<EFBFBD><CBB5><EFBFBD><EFBFBD><EFBFBD>ò<EFBFBD><C3B2><EFBFBD><EFBFBD><EFBFBD>
|
||||
// һ<><D2BB><EFBFBD>ģ<EFBFBD>ֻ<EFBFBD>ǵ<EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>
|
||||
// */
|
||||
//
|
||||
// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// colsum[j] = 0;
|
||||
// for (i = 0; i < N; i++)
|
||||
// colsum[j] += M[i][j];
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//
|
||||
// /* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA> */
|
||||
//
|
||||
// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// int i,j;
|
||||
// for (i = 0; i < N; i++) {
|
||||
// rowsum[i] = colsum[i] = 0;
|
||||
// for (j = 0; j < N; j++) {
|
||||
// rowsum[i] += M[i][j];
|
||||
// colsum[i] += M[j][i];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>Ż<EFBFBD><C5BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD> */
|
||||
// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// int *d_M, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>ݴ<EFBFBD><DDB4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><C6B5>豸
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD>CUDA<44>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD><E8B1B8><EFBFBD>ƻ<EFBFBD><C6BB><EFBFBD><EFBFBD><EFBFBD>
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // <20>ͷ<EFBFBD><CDB7>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>Ż<EFBFBD><C5BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD> */
|
||||
// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
// {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// int *d_M, *d_rowsum, *d_colsum;
|
||||
// cudaMalloc(&d_M, N * N * sizeof(int));
|
||||
// cudaMalloc(&d_rowsum, N * sizeof(int));
|
||||
// cudaMalloc(&d_colsum, N * sizeof(int));
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>ݴ<EFBFBD><DDB4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><C6B5>豸
|
||||
// cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD>CUDA<44>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// dim3 blockDim(256);
|
||||
// dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>˺<EFBFBD><CBBA><EFBFBD>
|
||||
// cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>豸<EFBFBD><E8B1B8><EFBFBD>ƻ<EFBFBD><C6BB><EFBFBD><EFBFBD><EFBFBD>
|
||||
// cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
// cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
|
||||
//
|
||||
// // <20>ͷ<EFBFBD><CDB7>豸<EFBFBD>ڴ<EFBFBD>
|
||||
// cudaFree(d_M);
|
||||
// cudaFree(d_rowsum);
|
||||
// cudaFree(d_colsum);
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>˺<EFBFBD><CBBA><EFBFBD> - <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// __global__ void cudaColumnSum(int *M, int *colsum)
|
||||
// {
|
||||
// int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (col < N) {
|
||||
// colsum[col] = 0;
|
||||
// for (int row = 0; row < N; row++) {
|
||||
// colsum[col] += M[row * N + col];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /* CUDA<44>˺<EFBFBD><CBBA><EFBFBD> - <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
|
||||
// {
|
||||
// int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// if (idx < N) {
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD>
|
||||
// rowsum[idx] = 0;
|
||||
// for (int j = 0; j < N; j++) {
|
||||
// rowsum[idx] += M[idx * N + j];
|
||||
// }
|
||||
//
|
||||
// // <20><><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD>
|
||||
// colsum[idx] = 0;
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// colsum[idx] += M[i * N + idx];
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// /*
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ԫ<EFBFBD>أ<EFBFBD>ÿһ<C3BF><D2BB>Ԫ<EFBFBD>أ<EFBFBD><D8A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD>"<22><>
|
||||
// COL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
// ROWCOL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
// <09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD>֣<EFBFBD><D6A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD>档
|
||||
// <09><><EFBFBD>磺
|
||||
// {my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
// {my_rc_sum2, "<22><>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
// */
|
||||
//
|
||||
// rc_fun_rec rc_fun_tab[] =
|
||||
// {
|
||||
//
|
||||
// /* <20><>һ<EFBFBD>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// {cuda_c_sum, COL, "CUDA optimized column sum"},
|
||||
// /* <20>ڶ<EFBFBD><DAB6>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
// {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
|
||||
//
|
||||
// {c_sum, COL, "Column sum, reference implementation"},
|
||||
//
|
||||
// {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
//
|
||||
// /* <20><><EFBFBD><EFBFBD><EFBFBD>Ĵ<EFBFBD><C4B4>벻<EFBFBD><EBB2BB><EFBFBD>Ļ<DEB8><C4BB><EFBFBD>ɾ<EFBFBD><C9BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>б<EFBFBD><D0B1><EFBFBD><EFBFBD><EFBFBD> */
|
||||
// {NULL,ROWCOL,NULL}
|
||||
// };
|
||||
@ -1,69 +0,0 @@
|
||||
/**************************************************************************
|
||||
??/???????????????????????????????
|
||||
1. ???????????????????????????????
|
||||
2. ??????????????????????
|
||||
3. ??rc_fun_rec rc_fun_tab??????????????????
|
||||
???????????????????????????????????????????
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
????201209054233
|
||||
??????????????
|
||||
*/
|
||||
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* ????????????????? */
|
||||
/* ???????????????????????????????????????????????
|
||||
??????????2?????????????????
|
||||
*/
|
||||
|
||||
/*
 * Column sum: colsum[j] = M[0][j] + M[1][j] + ... + M[N-1][j] for each j.
 * The rowsum argument is unused; it exists so c_sum and rc_sum share one
 * function signature.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
  int c;

  for (c = 0; c < N; c++) {
    int r = 0;

    colsum[c] = 0;
    while (r < N) {
      colsum[c] += M[r][c];
      r++;
    }
  }
}
|
||||
|
||||
/* ???????????????????? */
|
||||
/* ??????????????????????? */
|
||||
|
||||
/*
 * Row and column sums in one pass over the index range:
 * rowsum[n] accumulates row n of M, colsum[n] accumulates column n of M.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
  int n, m;

  for (n = 0; n < N; n++) {
    /* Same right-to-left chained store as the original. */
    rowsum[n] = colsum[n] = 0;
    for (m = 0; m != N; m++) {
      rowsum[n] += M[n][m];
      colsum[n] += M[m][n];
    }
  }
}
|
||||
|
||||
/*
|
||||
????????????????????????????????????????, COL/ROWCOL, "?????????"??
|
||||
COL??????????????????????
|
||||
ROWCOL???????????????????????
|
||||
?????????????????????????????
|
||||
????
|
||||
{my_c_sum1, "?????????????????"},
|
||||
{my_rc_sum2, "??????????????????"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] = {
|
||||
|
||||
/* ???????????????????????????????? */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* ?????????????????????????????????? */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* ??????????????????????????????????????? */
|
||||
{NULL, ROWCOL, NULL}};
|
||||
Binary file not shown.
121
perflab/matrix/rowcol_723005.c
Normal file
121
perflab/matrix/rowcol_723005.c
Normal file
@ -0,0 +1,121 @@
|
||||
/**************************************************************************
|
||||
<09><>/<2F><><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><D2AA><EFBFBD>༭<EFBFBD><E0BCAD><EFBFBD>ļ<EFBFBD><C4BC><EFBFBD>
|
||||
1. <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ѧ<EFBFBD>š<EFBFBD><C5A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ע<EFBFBD>͵ķ<CDB5>ʽд<CABD><D0B4><EFBFBD><EFBFBD><EFBFBD>棻
|
||||
2. ʵ<>ֲ<EFBFBD>ͬ<EFBFBD>汾<EFBFBD><E6B1BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD><EFBFBD><EFBFBD>
|
||||
3. <20>༭rc_fun_rec rc_fun_tab<61><62><EFBFBD>飬<EFBFBD><E9A3AC><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õĴ<C3B5><C4B4><EFBFBD>
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͡<EFBFBD><CDA1><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͣ<EFBFBD><CDA3><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD><C7B0><EFBFBD><EFBFBD>
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
ѧ<>ţ<EFBFBD>202302723005
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̾<EFBFBD><CCBE><EFBFBD>
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "rowcol.h"
|
||||
#include <math.h>
|
||||
|
||||
/* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA><CDA1><EFBFBD>ע<EFBFBD><D7A2><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>˵<EFBFBD><CBB5><EFBFBD><EFBFBD><EFBFBD>ò<EFBFBD><C3B2><EFBFBD><EFBFBD><EFBFBD>
|
||||
һ<><D2BB><EFBFBD>ģ<EFBFBD>ֻ<EFBFBD>ǵ<EFBFBD>2<EFBFBD><32><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>
|
||||
*/
|
||||
/*
|
||||
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (j = 0; j < N; j++) {
|
||||
colsum[j] = 0;
|
||||
for (i = 0; i < N; i++)
|
||||
colsum[j] += M[i][j];
|
||||
}
|
||||
}
|
||||
*/
|
||||
/*
 * Column sum, unrolled four columns at a time.
 *
 * colsum[j] receives the sum of column j of M for every j in [0, N).
 * rowsum is unused; it is present only so the signature matches rc_sum.
 *
 * Four adjacent columns are accumulated in independent local sums so each
 * row of M is read left-to-right within the group.  The original loop
 * assumed N was a multiple of 4 and would read M[i][j+1..j+3] and write
 * colsum[j+1..j+3] past the end otherwise; the trailing loop below handles
 * the last N % 4 columns one at a time.
 */
void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
    int i, j;

    (void)rowsum;

    /* Main part: only groups of four columns that fit entirely inside N. */
    for (j = 0; j + 3 < N; j += 4) {
        int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;

        for (i = 0; i < N; i++) {
            sum0 += M[i][j];
            sum1 += M[i][j + 1];
            sum2 += M[i][j + 2];
            sum3 += M[i][j + 3];
        }
        colsum[j] = sum0;
        colsum[j + 1] = sum1;
        colsum[j + 2] = sum2;
        colsum[j + 3] = sum3;
    }

    /* Remainder: at most three leftover columns when N % 4 != 0. */
    for (; j < N; j++) {
        int sum = 0;

        for (i = 0; i < N; i++) {
            sum += M[i][j];
        }
        colsum[j] = sum;
    }
}
|
||||
|
||||
|
||||
/* <20>ο<EFBFBD><CEBF><EFBFBD><EFBFBD>к<EFBFBD><D0BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͺ<EFBFBD><CDBA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>е<EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ͡<C4BA> */
|
||||
/*
|
||||
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
|
||||
{
|
||||
int i,j;
|
||||
for (i = 0; i < N; i++) {
|
||||
rowsum[i] = colsum[i] = 0;
|
||||
for (j = 0; j < N; j++) {
|
||||
rowsum[i] += M[i][j];
|
||||
colsum[i] += M[j][i];
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
/*
 * Row and column sums, processing four rows per pass.
 *
 * On return rowsum[i] holds the sum of row i and colsum[j] the sum of
 * column j of M, for all indices in [0, N).
 *
 * colsum is cleared first and then accumulated across passes; row sums are
 * kept in locals and stored once per row.  The original loop assumed N was
 * a multiple of 4 and would touch rows i+1..i+3 beyond the matrix (and
 * rowsum beyond its end) otherwise; the trailing loop handles the last
 * N % 4 rows individually.
 */
void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
    int i, j;

    /* Column sums are accumulated incrementally, so start from zero. */
    for (i = 0; i < N; i++) {
        colsum[i] = 0;
    }

    /* Main part: only groups of four rows that fit entirely inside N. */
    for (i = 0; i + 3 < N; i += 4) {
        int row_sum0 = 0, row_sum1 = 0, row_sum2 = 0, row_sum3 = 0;

        for (j = 0; j < N; j++) {
            /* Row sums for the four rows of this group. */
            row_sum0 += M[i][j];
            row_sum1 += M[i + 1][j];
            row_sum2 += M[i + 2][j];
            row_sum3 += M[i + 3][j];

            /* The same four elements also contribute to column j. */
            colsum[j] += M[i][j] + M[i + 1][j] + M[i + 2][j] + M[i + 3][j];
        }
        rowsum[i] = row_sum0;
        rowsum[i + 1] = row_sum1;
        rowsum[i + 2] = row_sum2;
        rowsum[i + 3] = row_sum3;
    }

    /* Remainder: at most three leftover rows when N % 4 != 0. */
    for (; i < N; i++) {
        int row_sum = 0;

        for (j = 0; j < N; j++) {
            row_sum += M[i][j];
            colsum[j] += M[i][j];
        }
        rowsum[i] = row_sum;
    }
}
|
||||
|
||||
|
||||
|
||||
/*
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ԫ<EFBFBD>أ<EFBFBD>ÿһ<C3BF><D2BB>Ԫ<EFBFBD>أ<EFBFBD><D8A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>, COL/ROWCOL, "<22><><EFBFBD><EFBFBD><EFBFBD>ַ<EFBFBD><D6B7><EFBFBD>"<22><>
|
||||
COL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
ROWCOL<4F><4C>ʾ<EFBFBD>ú<EFBFBD><C3BA><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ÿһ<C3BF>С<EFBFBD>ÿһ<C3BF>еĺ<D0B5>
|
||||
<09><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ<EFBFBD><CEAA><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD>֣<EFBFBD><D6A3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD>档
|
||||
<09><><EFBFBD>磺
|
||||
{my_c_sum1, "<22><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
{my_rc_sum2, "<22><>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>"},
|
||||
*/
|
||||
|
||||
rc_fun_rec rc_fun_tab[] =
|
||||
{
|
||||
|
||||
/* <20><>һ<EFBFBD>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
{c_sum, COL, "Best column sum"},
|
||||
/* <20>ڶ<EFBFBD><DAB6>Ӧ<EEA3AC><D3A6><EFBFBD><EFBFBD><EFBFBD><EFBFBD>д<EFBFBD><D0B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͵ĺ<CDB5><C4BA><EFBFBD>ʵ<EFBFBD><CAB5> */
|
||||
{rc_sum, ROWCOL, "Best row and column sum"},
|
||||
|
||||
{c_sum, COL, "Column sum, reference implementation"},
|
||||
|
||||
{rc_sum, ROWCOL, "Row and column sum, reference implementation"},
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD><EFBFBD>Ĵ<EFBFBD><C4B4>벻<EFBFBD><EBB2BB><EFBFBD>Ļ<DEB8><C4BB><EFBFBD>ɾ<EFBFBD><C9BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>б<EFBFBD><D0B1><EFBFBD><EFBFBD><EFBFBD> */
|
||||
{NULL,ROWCOL,NULL}
|
||||
};
|
||||
@ -1,9 +1,9 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
// #include <random.h>
|
||||
#include "clock.h"
|
||||
#include "fcyc.h"
|
||||
//#include <random.h>
|
||||
#include "rowcol.h"
|
||||
#include "fcyc.h"
|
||||
#include "clock.h"
|
||||
|
||||
#define MAX_ITER_COUNT 100
|
||||
|
||||
@ -11,9 +11,9 @@
|
||||
/*
 * Cycle counts used as grading baselines, one entry per kind of computation
 * (index 0: column sum, index 1: row & column sum).
 */
static struct {
    double cref;  /* Cycles taken by reference solution */
    double cbest; /* Cycles taken by our best implementation */
} cstandard[2] = {
    {7.7, 6.40}, /* Column Sum */
    {9.75, 6.60} /* Row & Column Sum */
};
|
||||
|
||||
/* Put in code to align matrix so that it starts on a cache block boundary.
|
||||
@ -26,7 +26,7 @@ static struct {
|
||||
#define WPB 16
|
||||
|
||||
int verbose = 1;
|
||||
int data[N * N + WPB];
|
||||
int data[N*N+WPB];
|
||||
int *mstart;
|
||||
|
||||
typedef vector_t *row_t;
|
||||
@ -37,122 +37,137 @@ vector_t rsref, csref, rcomp, ccomp;
|
||||
static void init_tests(void);
|
||||
extern void make_CPU_busy(void);
|
||||
|
||||
static void init_tests(void) {
|
||||
int i, j;
|
||||
size_t bytes_per_block = sizeof(int) * WPB;
|
||||
/* round mstart up to nearest block boundary */
|
||||
mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block *
|
||||
bytes_per_block);
|
||||
for (i = 0; i < N; i++) {
|
||||
rsref[i] = csref[i] = 0;
|
||||
}
|
||||
for (i = 0; i < N; i++) {
|
||||
for (j = 0; j < N; j++) {
|
||||
int val = rand();
|
||||
mstart[i * N + j] = val;
|
||||
rsref[i] += val;
|
||||
csref[j] += val;
|
||||
static void init_tests(void)
|
||||
{
|
||||
int i, j;
|
||||
size_t bytes_per_block = sizeof(int) * WPB;
|
||||
/* round mstart up to nearest block boundary */
|
||||
mstart = (int *)
|
||||
(((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block);
|
||||
for (i = 0; i < N; i++) {
|
||||
rsref[i] = csref[i] = 0;
|
||||
}
|
||||
for (i = 0; i < N; i++) {
|
||||
for (j = 0; j < N; j++) {
|
||||
int val = rand();
|
||||
mstart[i*N+j] = val;
|
||||
rsref[i] += val;
|
||||
csref[j] += val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Test function on all values */
|
||||
/*
 * Test function on all values.
 *
 * Fills rcomp/ccomp with a marker value, runs f on the test matrix, and
 * compares the results with the reference sums.  Row sums are checked only
 * for ROWCOL; column sums are checked for both ROWCOL and COL.
 *
 * f       - implementation under test
 * rpt     - stream for the mismatch message, or NULL to stay silent
 * rc_type - which results f is expected to produce
 *
 * Returns 1 if every checked sum matches, 0 at the first index with a
 * mismatch (the loop stops once ok is cleared).
 */
int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type)
{
    int i;
    int ok = 1;

    /* Pre-fill the outputs with a marker value before calling f. */
    for (i = 0; i < N; i++)
        rcomp[i] = ccomp[i] = 0xDEADBEEF;

    f((row_t)mstart, rcomp, ccomp);

    for (i = 0; ok && i < N; i++) {
        if (rc_type == ROWCOL && rsref[i] != rcomp[i]) {
            ok = 0;
            if (rpt)
                fprintf(rpt, "对第%d行的计算出错!正确结果是%d,但是计算得到%d\n", i,
                        rsref[i], rcomp[i]);
        }
        if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) {
            ok = 0;
            if (rpt)
                fprintf(rpt, "对第%d列的计算出错!正确结果是%d,但是计算得到%d\n", i,
                        csref[i], ccomp[i]);
        }
    }
    return ok;
}
|
||||
|
||||
/* Kludgy way to interface to cycle measuring code */
|
||||
void do_test(int *intf) {
|
||||
rc_fun f = (rc_fun)intf;
|
||||
void do_test(int *intf)
|
||||
{
|
||||
rc_fun f = (rc_fun) intf;
|
||||
f((row_t)mstart, rcomp, ccomp);
|
||||
}
|
||||
|
||||
/*
 * Check and time one implementation.
 *
 * f       - implementation to measure
 * rc_type - COL or ROWCOL, forwarded to test_rc
 * descr   - label printed when verbose is set
 * cycp    - if non-NULL, receives the average cycles per matrix element
 *
 * Nothing is timed (and *cycp is left untouched) when test_rc reports a
 * wrong result.  Otherwise fcyc is invoked MAX_ITER_COUNT times and the
 * measurements are averaged, then divided by N*N.
 */
void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp)
{
    int i;
    int *intf = (int *)f;   /* disguise f for do_test / fcyc */
    double t = 0;
    double cme;

    if (verbose)
        printf("函数:%s\n", descr);

    if (!test_rc(f, stdout, rc_type))
        return;

    make_CPU_busy();
    for (i = 0; i < MAX_ITER_COUNT; i++)
        t += fcyc((void (*)(long *))do_test, intf);
    t = t / MAX_ITER_COUNT;
    cme = t / (N * N);

    if (verbose)
        printf(" 总周期数 = %.2f, 平均周期/元素 = %.2f\n", t, cme);
    if (cycp)
        *cycp = cme;
}
|
||||
|
||||
/* Compute the grade achieved by function */
|
||||
static double compute_score(double cmeas, double cref, double cbest) {
|
||||
double sbest = cref / cbest;
|
||||
double smeas = cref / cmeas;
|
||||
if (smeas < 0.1 * (sbest - 1) + 1)
|
||||
static double compute_score(double cmeas, double cref, double cbest)
|
||||
{
|
||||
double sbest = cref/cbest;
|
||||
double smeas = cref/cmeas;
|
||||
if (smeas < 0.1*(sbest-1)+1)
|
||||
return 0;
|
||||
if (smeas > 1.1 * (sbest - 1) + 1)
|
||||
if (smeas > 1.1*(sbest-1)+1)
|
||||
return 120;
|
||||
return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1);
|
||||
return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int i;
|
||||
double cme;
|
||||
double cme_c, cme_rc;
|
||||
int EnableScore = 0;
|
||||
|
||||
if (argc == 3) {
|
||||
EnableScore = 1;
|
||||
verbose = 0;
|
||||
double cme_c,cme_rc;
|
||||
int EnableScore=0;
|
||||
|
||||
if (argc == 3)
|
||||
{
|
||||
EnableScore = 1;
|
||||
verbose = 0;
|
||||
}
|
||||
init_tests();
|
||||
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
||||
set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
|
||||
for (i = 0; rc_fun_tab[i].f != NULL; i++) {
|
||||
cme = 100.0;
|
||||
time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
||||
if (i == 0) {
|
||||
cme_c = cme;
|
||||
if (EnableScore == 0) {
|
||||
printf(" 最高\"列求和\"得分 ======================== %.0f\n",
|
||||
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
||||
}
|
||||
}
|
||||
if (i == 1) {
|
||||
cme_rc = cme;
|
||||
if (EnableScore == 0) {
|
||||
printf(" 最高\"行和列求和\"得分 ====================== %.0f\n",
|
||||
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
||||
}
|
||||
}
|
||||
cme = 100.0;
|
||||
time_rc(rc_fun_tab[i].f,
|
||||
rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
|
||||
if (i == 0)
|
||||
{
|
||||
cme_c = cme;
|
||||
if (EnableScore==0)
|
||||
{
|
||||
printf(" <20><><EFBFBD><EFBFBD>\"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\"<EFBFBD>÷<EFBFBD> ======================== %.0f\n",
|
||||
compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
|
||||
}
|
||||
}
|
||||
if (i == 1)
|
||||
{
|
||||
cme_rc = cme;
|
||||
if (EnableScore==0)
|
||||
{
|
||||
printf(" <20><><EFBFBD><EFBFBD>\"<EFBFBD>к<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\"<EFBFBD>÷<EFBFBD> ====================== %.0f\n",
|
||||
compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (EnableScore)
|
||||
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c,
|
||||
compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc,
|
||||
compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
||||
printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest),
|
||||
cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
|
||||
return 0;
|
||||
}
|
||||
|
||||
Binary file not shown.
@ -1,35 +0,0 @@
|
||||
CC = gcc
|
||||
NVCC = nvcc
|
||||
CFLAGS = -Wall -O2 -g
|
||||
CUDA_FLAGS = -O2 -g
|
||||
LDFLAGS = -lm -lcudart
|
||||
|
||||
# Source files
|
||||
SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c
|
||||
CUDA_SRCS = poly.cu
|
||||
OBJS = $(SRCS:.c=.o) poly.o
|
||||
|
||||
# Target executable
|
||||
TARGET = poly_test
|
||||
|
||||
# Default target
|
||||
all: $(TARGET)
|
||||
|
||||
# Rule to build the executable
|
||||
$(TARGET): $(OBJS)
|
||||
$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
|
||||
|
||||
# Rule to build object files
|
||||
%.o: %.c
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
# Rule to build CUDA object files
|
||||
poly.o: poly.cu
|
||||
$(NVCC) $(CUDA_FLAGS) -c $< -o $@
|
||||
|
||||
# Clean rule
|
||||
clean:
|
||||
rm -f $(OBJS) $(TARGET)
|
||||
|
||||
# Phony targets
|
||||
.PHONY: all clean
|
||||
BIN
perflab/poly/a.exe
Executable file
BIN
perflab/poly/a.exe
Executable file
Binary file not shown.
@ -13,11 +13,11 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <x86intrin.h>
|
||||
// #include <intrinsics.h>
|
||||
#include "clock.h"
|
||||
#include <time.h>
|
||||
#include <intrin.h>
|
||||
//#include <intrinsics.h>
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#include "clock.h"
|
||||
|
||||
/* Use x86 cycle counter */
|
||||
|
||||
@ -27,195 +27,203 @@ static unsigned cyc_lo = 0;
|
||||
|
||||
/* Set *hi and *lo to the high and low order bits of the cycle counter.
|
||||
Implementation requires assembly code to use the rdtsc instruction. */
|
||||
void access_counter(unsigned *hi, unsigned *lo) {
|
||||
void access_counter(unsigned *hi, unsigned *lo)
|
||||
{
|
||||
|
||||
long long counter;
|
||||
long long counter;
|
||||
|
||||
counter = __rdtsc();
|
||||
(*hi) = (unsigned int)(counter >> 32);
|
||||
(*lo) = (unsigned int)counter;
|
||||
/*
|
||||
counter = __rdtsc();
|
||||
(*hi) = (unsigned int)(counter >> 32);
|
||||
(*lo) = (unsigned int)counter;
|
||||
/*
|
||||
|
||||
LARGE_INTEGER lPerformanceCount;
|
||||
LARGE_INTEGER lPerformanceCount;
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount);
|
||||
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
||||
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
||||
// printf("%08X %08X\n",(*hi),(*lo));
|
||||
*/
|
||||
QueryPerformanceCounter(&lPerformanceCount);
|
||||
(*hi) = (unsigned int)lPerformanceCount.HighPart;
|
||||
(*lo) = (unsigned int)lPerformanceCount.LowPart;
|
||||
// printf("%08X %08X\n",(*hi),(*lo));
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
/* Record the current value of the cycle counter. */
|
||||
void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
|
||||
void start_counter()
|
||||
{
|
||||
access_counter(&cyc_hi, &cyc_lo);
|
||||
}
|
||||
|
||||
/* Return the number of cycles since the last call to start_counter. */
|
||||
double get_counter() {
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
double get_counter()
|
||||
{
|
||||
unsigned ncyc_hi, ncyc_lo;
|
||||
unsigned hi, lo, borrow;
|
||||
double result;
|
||||
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
/* Get cycle counter */
|
||||
access_counter(&ncyc_hi, &ncyc_lo);
|
||||
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double)hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
/* Do double precision subtraction */
|
||||
lo = ncyc_lo - cyc_lo;
|
||||
borrow = cyc_lo > ncyc_lo;
|
||||
hi = ncyc_hi - cyc_hi - borrow;
|
||||
result = (double) hi * (1 << 30) * 4 + lo;
|
||||
return result;
|
||||
}
|
||||
void make_CPU_busy(void) {
|
||||
volatile double old_tick, new_tick;
|
||||
start_counter();
|
||||
old_tick = get_counter();
|
||||
new_tick = get_counter();
|
||||
while (new_tick - old_tick < 1000000000)
|
||||
new_tick = get_counter();
|
||||
void make_CPU_busy(void)
|
||||
{
|
||||
volatile double old_tick,new_tick;
|
||||
start_counter();
|
||||
old_tick = get_counter();
|
||||
new_tick = get_counter();
|
||||
while (new_tick - old_tick < 1000000000)
|
||||
new_tick = get_counter();
|
||||
}
|
||||
|
||||
// CPU<EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD>
|
||||
double mhz(int verbose) {
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
// On a multiprocessor machine, it should not matter which processor is
|
||||
// called. However, you can get different results on different processors due
|
||||
// to bugs in the BIOS or the HAL. To specify processor affinity for a thread,
|
||||
// use the SetThreadAffinityMask function.
|
||||
HANDLE hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
//CPU<50><55>Ƶ<EFBFBD><C6B5>
|
||||
double mhz(int verbose)
|
||||
{
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER lPerformanceCount_Start;
|
||||
LARGE_INTEGER lPerformanceCount_End;
|
||||
double mhz;
|
||||
double fTime;
|
||||
__int64 _i64StartCpuCounter;
|
||||
__int64 _i64EndCpuCounter;
|
||||
//On a multiprocessor machine, it should not matter which processor is called.
|
||||
//However, you can get different results on different processors due to bugs in
|
||||
//the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
|
||||
HANDLE hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
|
||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ϸ߾<EFBFBD><EFBFBD>ȶ<EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD>ľ<EFBFBD><EFBFBD><EFBFBD>Ƶ<EFBFBD><EFBFBD>
|
||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ӧ<EFBFBD>þ<EFBFBD><EFBFBD><EFBFBD>һƄ1<EFBFBD>78253<EFBFBD><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
// <EFBFBD><EFBFBD>intel ich7<68>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("<22>߾<EFBFBD><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
//<2F><><EFBFBD><EFBFBD><EFBFBD>ϸ߾<CFB8><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD><C6B5>
|
||||
//<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ӧ<EFBFBD>þ<EFBFBD><EFBFBD><EFBFBD>һƬ8253<EFBFBD><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
//<2F><>intel ich7<68>м<EFBFBD><D0BC><EFBFBD><EFBFBD><EFBFBD>8254
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
// if (verbose>0)
|
||||
// printf("<22>߾<EFBFBD><DFBE>ȶ<EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD>ľ<EFBFBD><C4BE><EFBFBD>Ƶ<EFBFBD>ʣ<EFBFBD>%1.0fHz.\n",(double)lFrequency.QuadPart);
|
||||
|
||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>ÿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
//<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>ÿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ڣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>+1
|
||||
QueryPerformanceCounter(&lPerformanceCount_Start);
|
||||
|
||||
// RDTSCָ<EFBFBD><EFBFBD>:<3A><>ȡCPU<50><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
_i64StartCpuCounter = __rdtsc();
|
||||
//RDTSCָ<43><D6B8>:<3A><>ȡCPU<50><55><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
_i64StartCpuCounter=__rdtsc();
|
||||
|
||||
// <EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD>,<2C><><EFBFBD><EFBFBD>Сһ<D0A1><D2BB>
|
||||
// int nTemp=100000;
|
||||
// while (--nTemp);
|
||||
Sleep(200);
|
||||
//<2F><>ʱ<EFBFBD><CAB1>һ<EFBFBD><D2BB>,<2C><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Сһ<EFBFBD><EFBFBD>
|
||||
//int nTemp=100000;
|
||||
//while (--nTemp);
|
||||
Sleep(200);
|
||||
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
QueryPerformanceCounter(&lPerformanceCount_End);
|
||||
|
||||
_i64EndCpuCounter = __rdtsc();
|
||||
_i64EndCpuCounter=__rdtsc();
|
||||
|
||||
// f=1/T => f=<3D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/(<28><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T)
|
||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD>ġ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ᅣ1<EFBFBD>7*T<><54><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1>ᅣ1<EFBFBD>7
|
||||
fTime = ((double)lPerformanceCount_End.QuadPart -
|
||||
(double)lPerformanceCount_Start.QuadPart) /
|
||||
(double)lFrequency.QuadPart;
|
||||
//f=1/T => f=<3D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>/(<28><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T)
|
||||
//<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ġ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*T<><54><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1><EFBFBD><EFBFBD>
|
||||
fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
|
||||
/(double)lFrequency.QuadPart;
|
||||
|
||||
mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
|
||||
if (verbose > 0)
|
||||
printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n", mhz);
|
||||
return mhz;
|
||||
mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
|
||||
if (verbose>0)
|
||||
printf("CPUƵ<EFBFBD><EFBFBD>Ϊ:%1.6fMHz.\n",mhz);
|
||||
return mhz;
|
||||
}
|
||||
|
||||
double CPU_Factor1(void) {
|
||||
double result;
|
||||
int i, j, k, ii, jj, kk;
|
||||
LARGE_INTEGER lStart, lEnd;
|
||||
double CPU_Factor1(void)
|
||||
{
|
||||
double result;
|
||||
int i,j,k,ii,jj,kk;
|
||||
LARGE_INTEGER lStart,lEnd;
|
||||
LARGE_INTEGER lFrequency;
|
||||
HANDLE hThread;
|
||||
double fTime;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
ii = 43273;
|
||||
kk = 1238;
|
||||
result = 1;
|
||||
jj = 1244;
|
||||
|
||||
hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&lStart);
|
||||
//_asm("cpuid");
|
||||
start_counter();
|
||||
for (i = 0; i < 100; i++)
|
||||
for (j = 0; j < 1000; j++)
|
||||
for (k = 0; k < 1000; k++)
|
||||
kk += kk * ii + jj;
|
||||
start_counter();
|
||||
for (i=0;i<100;i++)
|
||||
for (j=0;j<1000;j++)
|
||||
for (k=0;k<1000;k++)
|
||||
kk += kk*ii+jj;
|
||||
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart);
|
||||
printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f", result);
|
||||
printf("\t %f\n", fTime);
|
||||
return result;
|
||||
result = get_counter();
|
||||
QueryPerformanceCounter(&lEnd);
|
||||
fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
|
||||
printf("CPU<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʱ<EFBFBD><EFBFBD>Ϊ%f",result);
|
||||
printf("\t %f\n",fTime);
|
||||
return result;
|
||||
}
|
||||
|
||||
double CPU_Factor(void) {
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000; // nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start, stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz = 1000 * 1000 * 1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
double CPU_Factor(void)
|
||||
{
|
||||
double frequency;
|
||||
double multiplier = 1000 * 1000 * 1000;//nano
|
||||
LARGE_INTEGER lFrequency;
|
||||
LARGE_INTEGER start,stop;
|
||||
HANDLE hThread;
|
||||
int i;
|
||||
const int gigahertz= 1000*1000*1000;
|
||||
const int known_instructions_per_loop = 27317;
|
||||
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
int iterations = 100000000;
|
||||
int g = 0;
|
||||
double normal_ticks_per_second;
|
||||
double ticks;
|
||||
double time;
|
||||
double loops_per_sec;
|
||||
double instructions_per_loop;
|
||||
double ratio;
|
||||
double actual_freq;
|
||||
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
QueryPerformanceFrequency(&lFrequency);
|
||||
frequency = (double)lFrequency.QuadPart;
|
||||
|
||||
hThread = GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread, 0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for (i = 0; i < iterations; i++) {
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
hThread=GetCurrentThread();
|
||||
SetThreadAffinityMask(hThread,0x1);
|
||||
QueryPerformanceCounter(&start);
|
||||
for( i = 0; i < iterations; i++)
|
||||
{
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
g++;
|
||||
}
|
||||
QueryPerformanceCounter(&stop);
|
||||
|
||||
// normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ
|
||||
// 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) / frequency;
|
||||
loops_per_sec = iterations / (time / multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
//normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
|
||||
normal_ticks_per_second = frequency * 1000;
|
||||
ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
|
||||
time = (ticks * multiplier) /frequency;
|
||||
loops_per_sec = iterations / (time/multiplier);
|
||||
instructions_per_loop = normal_ticks_per_second / loops_per_sec;
|
||||
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
ratio = (instructions_per_loop / known_instructions_per_loop);
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
/*
|
||||
actual_freq = normal_ticks_per_second / ratio;
|
||||
actual_freq = known_instructions_per_loop*iterations*multiplier/time;
|
||||
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n", time);
|
||||
return ratio;
|
||||
2293 = x/time;
|
||||
|
||||
2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
|
||||
loops_per_sec = iterations*frequency / ticks
|
||||
|
||||
instructions_per_loop = / loops_per_sec;
|
||||
*/
|
||||
printf("Perf counter freq: %f\n", normal_ticks_per_second);
|
||||
printf("Loops per sec: %f\n", loops_per_sec);
|
||||
printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
|
||||
printf("Presumed freq: %f\n", actual_freq);
|
||||
printf("ratio: %f\n", ratio);
|
||||
printf("time=%f\n",time);
|
||||
return ratio;
|
||||
}
|
||||
|
||||
@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
|
||||
if (clear_cache)
|
||||
clear();
|
||||
start_counter();
|
||||
f(params);
|
||||
f((long int*)params);
|
||||
cyc = get_counter();
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
|
||||
clear();
|
||||
start_counter();
|
||||
for (i=0;i<MAX_ITER_TIMES;i++)
|
||||
f(params);
|
||||
f((long int *)params);
|
||||
cyc = get_counter()/MAX_ITER_TIMES;
|
||||
if (cyc > 0.0)
|
||||
add_sample(cyc);
|
||||
|
||||
@ -1,325 +0,0 @@
|
||||
/**************************************************************************
|
||||
多项式计算函数。按下面的要求编辑此文件:
|
||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
2. 实现不同版本的多项式计算函数;
|
||||
3. 编辑peval_fun_rec peval_fun_tab数组,将你的最好的答案
|
||||
(最小CPE、最小C10)作为数组的前两项
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
学号:201209054233
|
||||
姓名:夜半加班狂
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda_runtime.h>
|
||||
typedef int (*peval_fun)(int*, int, int);
|
||||
|
||||
typedef struct {
|
||||
peval_fun f;
|
||||
char *descr;
|
||||
} peval_fun_rec, *peval_fun_ptr;
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
Edit this comment to indicate your name and Andrew ID
|
||||
#ifdef ASSIGN
|
||||
Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
|
||||
#else
|
||||
Instructor's version.
|
||||
Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
|
||||
#endif
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
实现一个指定的常系数多项式计算
|
||||
第一次,请直接运行程序,以便获知你需要实现的常系数是啥
|
||||
*/
|
||||
/*
 * Evaluate the fixed cubic 21 + 90*x + 42*x^2 + 88*x^3.
 *
 * The constant multiplications are decomposed into shifts and adds:
 *   90 = 64 + 32 - 4 - 2
 *   42 = 32 + 8 + 2
 *   88 = 64 + 16 + 8
 *
 * All arithmetic is carried out on unsigned int so that negative x and
 * overflowing intermediates wrap modulo 2^32 instead of invoking undefined
 * behaviour (left-shifting a negative int and signed overflow are both UB).
 * The final conversion back to int yields the usual two's-complement result
 * on mainstream compilers.
 *
 * not_use / not_use2 exist only so the function matches the peval_fun
 * signature; they are ignored.
 */
int const_poly_eval(int *not_use, int not_use2, int x)
{
    unsigned int ux = (unsigned int)x;
    unsigned int x90 = (ux << 6) + (ux << 5) - (ux << 2) - (ux << 1); /* 90*x */
    unsigned int x42 = (ux << 5) + (ux << 3) + (ux << 1);             /* 42*x */
    unsigned int x88 = (ux << 6) + (ux << 4) + (ux << 3);             /* 88*x */

    (void)not_use;
    (void)not_use2;

    /* 21 + 90x + (42x + 88x*x)*x  ==  21 + 90x + 42x^2 + 88x^3 */
    return (int)(21u + x90 + (x42 + x88 * ux) * ux);
}
|
||||
|
||||
|
||||
|
||||
/* 多项式计算函数。注意:这个只是一个参考实现,你需要实现自己的版本 */
|
||||
|
||||
/*
|
||||
友情提示:lcc支持ATT格式的嵌入式汇编,例如
|
||||
|
||||
_asm("movl %eax,%ebx");
|
||||
_asm("pushl %edx");
|
||||
|
||||
可以在lcc中project->configuration->Compiler->Code Generation->Generate .asm,
|
||||
将其选中后,可以在lcc目录下面生成对应程序的汇编代码实现。通过查看汇编文件,
|
||||
你可以了解编译器是如何实现你的代码的。有些实现可能非常低效。
|
||||
你可以在适当的地方加入嵌入式汇编,来大幅度提高计算性能。
|
||||
*/
|
||||
|
||||
/*
 * Reference polynomial evaluation:
 *   a[0] + a[1]*x + a[2]*x^2 + ... + a[degree]*x^degree
 *
 * a      - coefficient array holding at least degree + 1 entries
 * degree - highest power; a negative degree evaluates to 0
 * x      - point at which the polynomial is evaluated
 *
 * The running power and the sum are kept in unsigned int so that large
 * degrees wrap modulo 2^32 rather than triggering signed-overflow undefined
 * behaviour; the low 32 bits of the result are the same as before.
 */
int poly_eval(int *a, int degree, int x)
{
    unsigned int sum = 0;
    unsigned int xpwr = 1; /* current power of x */
    int i;

    for (i = 0; i <= degree; i++) {
        sum += (unsigned int)a[i] * xpwr;
        xpwr *= (unsigned int)x;
    }
    return (int)sum;
}
|
||||
|
||||
/* CUDA优化的多项式计算函数 - 低CPE版本 */
|
||||
/* Forward declaration: the kernel is defined after this host wrapper, and a
 * kernel launch needs a visible declaration to compile. */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results);

/*
 * Evaluate a polynomial on the GPU, one term per thread.
 *
 * The coefficients are copied to the device, cudaPolyEvalLowCPE writes
 * a[i] * x^i into a per-term result array, and the terms are copied back and
 * summed on the host.
 *
 * a      - host coefficient array with degree + 1 entries
 * degree - highest power; a negative degree evaluates to 0
 * x      - evaluation point
 *
 * Returns the polynomial value, or 0 after printing a diagnostic if any CUDA
 * call or the host allocation fails.
 */
int cuda_poly_eval_low_cpe(int *a, int degree, int x)
{
    const unsigned int threads_per_block = 256;
    int *d_a = NULL;
    int *d_results = NULL;
    int *h_results = NULL;
    cudaError_t err = cudaSuccess;
    size_t count;
    size_t nbytes;
    size_t i;
    unsigned int blocks;
    unsigned int sum = 0;
    int result = 0;

    /* Nothing to evaluate; also avoids a zero-sized launch configuration. */
    if (degree < 0)
        return 0;

    count = (size_t)degree + 1;
    nbytes = count * sizeof(int);
    blocks = (unsigned int)((count + threads_per_block - 1) / threads_per_block);

    err = cudaMalloc(&d_a, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMalloc(&d_results, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    /* Host -> device: coefficients. */
    err = cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        goto cuda_fail;

    cudaPolyEvalLowCPE<<<blocks, threads_per_block>>>(d_a, degree, x, d_results);

    /* Catches launch-configuration errors. */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        goto cuda_fail;

    h_results = (int *)malloc(nbytes);
    if (h_results == NULL) {
        printf("Memory allocation error\n");
        goto cleanup;
    }

    /* Device -> host: per-term products. */
    err = cudaMemcpy(h_results, d_results, nbytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        goto cuda_fail;

    /* Unsigned accumulation so wrap-around is well defined. */
    for (i = 0; i < count; i++)
        sum += (unsigned int)h_results[i];
    result = (int)sum;
    goto cleanup;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));

cleanup:
    /* free(NULL) and cudaFree(NULL) are both no-ops. */
    free(h_results);
    cudaFree(d_a);
    cudaFree(d_results);
    return result;
}
|
||||
|
||||
/* CUDA优化的多项式计算函数 - 10阶优化版本 */
|
||||
/* Forward declaration: the kernel is defined after this host wrapper, and a
 * kernel launch needs a visible declaration to compile. */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result);

/*
 * Evaluate a low-degree polynomial on the GPU using a single thread block.
 *
 * Each thread of the block handles one term and the block reduces the terms
 * into a single int that is copied back to the host.
 *
 * a      - host coefficient array with degree + 1 entries
 * degree - highest power; a negative degree evaluates to 0
 * x      - evaluation point
 *
 * Only one block of 256 threads is launched, so at most 256 terms can be
 * handled on the device. Larger degrees would silently lose the high-order
 * terms; they are routed to the host reference implementation poly_eval()
 * instead.
 *
 * Returns the polynomial value, or 0 after printing a diagnostic if any CUDA
 * call fails.
 */
int cuda_poly_eval_degree10(int *a, int degree, int x)
{
    const int threads_per_block = 256;
    int *d_a = NULL;
    int *d_result = NULL;
    cudaError_t err = cudaSuccess;
    size_t nbytes;
    int result = 0;

    if (degree < 0)
        return 0;

    /* More terms than threads in the single block: compute on the host. */
    if (degree >= threads_per_block)
        return poly_eval(a, degree, x);

    nbytes = ((size_t)degree + 1) * sizeof(int);

    err = cudaMalloc(&d_a, nbytes);
    if (err != cudaSuccess)
        goto cuda_fail;

    err = cudaMalloc(&d_result, sizeof(int));
    if (err != cudaSuccess)
        goto cuda_fail;

    /* Host -> device: coefficients. */
    err = cudaMemcpy(d_a, a, nbytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess)
        goto cuda_fail;

    /* One block is enough because only one scalar result is produced. */
    cudaPolyEvalDegree10<<<1, threads_per_block>>>(d_a, degree, x, d_result);

    /* Catches launch-configuration errors. */
    err = cudaGetLastError();
    if (err != cudaSuccess)
        goto cuda_fail;

    /* Device -> host: the reduced value. */
    err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        goto cuda_fail;

    goto cleanup;

cuda_fail:
    printf("CUDA Error: %s\n", cudaGetErrorString(err));
    result = 0;

cleanup:
    /* cudaFree(NULL) is a no-op. */
    cudaFree(d_a);
    cudaFree(d_result);
    return result;
}
|
||||
|
||||
/* CUDA核函数 - 低CPE版本 */
|
||||
/*
 * Kernel: one thread per polynomial term.
 *
 * The thread with global index idx writes a[idx] * x^idx into results[idx];
 * threads whose index exceeds degree do nothing.
 *
 * x^idx is computed by square-and-multiply (O(log idx) multiplications
 * instead of idx), in unsigned arithmetic so wrap-around modulo 2^32 is well
 * defined. The low 32 bits are identical to repeated signed multiplication.
 */
__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int base;
    unsigned int xpwr;
    unsigned int e;

    if (idx > degree)
        return;

    base = (unsigned int)x;
    xpwr = 1u;
    for (e = (unsigned int)idx; e != 0u; e >>= 1) {
        if (e & 1u)
            xpwr *= base;
        base *= base;
    }

    results[idx] = (int)((unsigned int)a[idx] * xpwr);
}
|
||||
|
||||
/* CUDA核函数 - 10阶优化版本 */
|
||||
/*
 * Kernel: single-block polynomial evaluation with an in-block reduction.
 *
 * Thread t (t <= degree) computes a[t] * x^t and adds it atomically into a
 * block-shared accumulator; thread 0 then stores the total in *result.
 *
 * The thread index is converted to int before it is compared with degree.
 * Comparing the unsigned threadIdx.x directly against a negative degree would
 * convert degree to a huge unsigned value and let every thread read a[] out
 * of bounds.
 *
 * Threads without a term skip the power loop and the atomic add, but every
 * thread still reaches both __syncthreads() barriers.
 */
__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result)
{
    __shared__ int shared_result;
    int tid = (int)threadIdx.x;

    /* A single thread initialises the accumulator. */
    if (tid == 0) {
        shared_result = 0;
    }
    __syncthreads();

    if (tid <= degree) {
        /* x^tid by square-and-multiply, wrapping modulo 2^32. */
        unsigned int base = (unsigned int)x;
        unsigned int xpwr = 1u;
        unsigned int e;

        for (e = (unsigned int)tid; e != 0u; e >>= 1) {
            if (e & 1u)
                xpwr *= base;
            base *= base;
        }

        atomicAdd(&shared_result, (int)((unsigned int)a[tid] * xpwr));
    }

    /* All contributions must land before thread 0 reads the total. */
    __syncthreads();

    if (tid == 0) {
        *result = shared_result;
    }
}
|
||||
|
||||
/*
|
||||
这个表格包含多个数组元素,每一组元素(函数名字, "描述字符串")
|
||||
将你认为最好的两个实现,放在最前面。
|
||||
比如:
|
||||
{my_poly_eval1, "超级垃圾实现"},
|
||||
{my_poly_eval2, "好一点的实现"},
|
||||
*/
|
||||
|
||||
peval_fun_rec peval_fun_tab[] =
|
||||
{
|
||||
|
||||
/* 第一项,应当是你写的最好CPE的函数实现 */
|
||||
{cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"},
|
||||
/* 第二项,应当是你写的在10阶时具有最好性能的实现 */
|
||||
{cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"},
|
||||
|
||||
{poly_eval, "poly_eval: 参考实现"},
|
||||
|
||||
/* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
{NULL, ""}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
@ -1,125 +1,143 @@
|
||||
/**************************************************************************
|
||||
多项式计算函数。按下面的要求编辑此文件:
|
||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
2. 实现不同版本的多项式计算函数;
|
||||
3. 编辑peval_fun_rec peval_fun_tab数组,将你的最好的答案
|
||||
(最小CPE、最小C10)作为数组的前两项
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
学号:201209054233
|
||||
姓名:夜半加班狂
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
typedef int (*peval_fun)(int*, int, int);
|
||||
|
||||
typedef struct {
|
||||
peval_fun f;
|
||||
char *descr;
|
||||
} peval_fun_rec, *peval_fun_ptr;
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
Edit this comment to indicate your name and Andrew ID
|
||||
#ifdef ASSIGN
|
||||
Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
|
||||
#else
|
||||
Instructor's version.
|
||||
Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
|
||||
#endif
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
实现一个指定的常系数多项式计算
|
||||
第一次,请直接运行程序,以便获知你需要实现的常系数是啥
|
||||
*/
|
||||
/*
 * Evaluate the fixed cubic 21 + 90*x + 42*x^2 + 88*x^3.
 *
 * The constant multiplications are decomposed into shifts and adds:
 *   90 = 64 + 32 - 4 - 2
 *   42 = 32 + 8 + 2
 *   88 = 64 + 16 + 8
 *
 * All arithmetic is carried out on unsigned int so that negative x and
 * overflowing intermediates wrap modulo 2^32 instead of invoking undefined
 * behaviour (left-shifting a negative int and signed overflow are both UB).
 * The final conversion back to int yields the usual two's-complement result
 * on mainstream compilers.
 *
 * not_use / not_use2 exist only so the function matches the peval_fun
 * signature; they are ignored.
 */
int const_poly_eval(int *not_use, int not_use2, int x)
{
    unsigned int ux = (unsigned int)x;
    unsigned int x90 = (ux << 6) + (ux << 5) - (ux << 2) - (ux << 1); /* 90*x */
    unsigned int x42 = (ux << 5) + (ux << 3) + (ux << 1);             /* 42*x */
    unsigned int x88 = (ux << 6) + (ux << 4) + (ux << 3);             /* 88*x */

    (void)not_use;
    (void)not_use2;

    /* 21 + 90x + (42x + 88x*x)*x  ==  21 + 90x + 42x^2 + 88x^3 */
    return (int)(21u + x90 + (x42 + x88 * ux) * ux);
}
|
||||
|
||||
|
||||
|
||||
/* 多项式计算函数。注意:这个只是一个参考实现,你需要实现自己的版本 */
|
||||
|
||||
/*
|
||||
友情提示:lcc支持ATT格式的嵌入式汇编,例如
|
||||
|
||||
_asm("movl %eax,%ebx");
|
||||
_asm("pushl %edx");
|
||||
|
||||
可以在lcc中project->configuration->Compiler->Code Generation->Generate .asm,
|
||||
将其选中后,可以在lcc目录下面生成对应程序的汇编代码实现。通过查看汇编文件,
|
||||
你可以了解编译器是如何实现你的代码的。有些实现可能非常低效。
|
||||
你可以在适当的地方加入嵌入式汇编,来大幅度提高计算性能。
|
||||
*/
|
||||
|
||||
/*
 * Reference polynomial evaluation:
 *   a[0] + a[1]*x + a[2]*x^2 + ... + a[degree]*x^degree
 *
 * a      - coefficient array holding at least degree + 1 entries
 * degree - highest power; a negative degree evaluates to 0
 * x      - point at which the polynomial is evaluated
 *
 * The running power and the sum are kept in unsigned int so that large
 * degrees wrap modulo 2^32 rather than triggering signed-overflow undefined
 * behaviour; the low 32 bits of the result are the same as before.
 */
int poly_eval(int *a, int degree, int x)
{
    unsigned int sum = 0;
    unsigned int xpwr = 1; /* current power of x */
    int i;

    for (i = 0; i <= degree; i++) {
        sum += (unsigned int)a[i] * xpwr;
        xpwr *= (unsigned int)x;
    }
    return (int)sum;
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
这个表格包含多个数组元素,每一组元素(函数名字, "描述字符串")
|
||||
将你认为最好的两个实现,放在最前面。
|
||||
比如:
|
||||
{my_poly_eval1, "超级垃圾实现"},
|
||||
{my_poly_eval2, "好一点的实现"},
|
||||
*/
|
||||
|
||||
peval_fun_rec peval_fun_tab[] =
|
||||
{
|
||||
|
||||
/* 第一项,应当是你写的最好CPE的函数实现 */
|
||||
{poly_eval, "夜半加班狂的CPE"},
|
||||
/* 第二项,应当是你写的在10阶时具有最好性能的实现 */
|
||||
{poly_eval, "夜半加班狂的10阶实现"},
|
||||
|
||||
{poly_eval, "poly_eval: 参考实现"},
|
||||
|
||||
/* 下面的代码不能修改或者删除!!表明数组列表结束 */
|
||||
{NULL, ""}
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
常系数多项式计算函数。
|
||||
通过减少乘法次数和使用位运算优化性能。
|
||||
公式:result = 37 + 72*x + 84*x^2 + 52*x^3
|
||||
*/
|
||||
|
||||
/**************************************************************************
|
||||
多项式计算函数。按下面的要求编辑此文件:
|
||||
1. 将你的学号、姓名,以注释的方式写到下面;
|
||||
2. 实现不同版本的多项式计算函数;
|
||||
3. 编辑peval_fun_rec peval_fun_tab数组,将你的最好的答案
|
||||
(最小CPE、最小C10)作为数组的前两项
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
学号:202302723005
|
||||
姓名:程景愉
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
/* Pointer to a polynomial evaluator; poly_eval(int *a, int degree, int x) has this shape. */
typedef int (*peval_fun)(int*, int, int);
|
||||
|
||||
/* One table entry: an evaluator function together with its description string. */
typedef struct {
|
||||
peval_fun f;
|
||||
char *descr;
|
||||
} peval_fun_rec, *peval_fun_ptr;
|
||||
|
||||
|
||||
/**************************************************************************
|
||||
Edit this comment to indicate your name and Andrew ID
|
||||
#ifdef ASSIGN
|
||||
Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
|
||||
#else
|
||||
Instructor's version.
|
||||
Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
|
||||
#endif
|
||||
***************************************************************************/
|
||||
|
||||
/*
|
||||
实现一个指定的常系数多项式计算
|
||||
第一次,请直接运行程序,以便获知你需要实现的常系数是啥
|
||||
*/
|
||||
/*
 * Reference polynomial evaluation (straightforward power-accumulating form).
 *
 * Computes a[0] + a[1]*x + ... + a[degree]*x^degree.
 *
 *   a      - coefficients, a[i] being the factor of x^i; entries 0..degree are read
 *   degree - highest power; a negative value skips the loop and returns 0
 *   x      - evaluation point
 *
 * All arithmetic is done on unsigned values so that large intermediates wrap
 * around. The earlier int arithmetic overflowed as signed, which is undefined
 * behaviour, and also computed one extra power of x after the last term.
 */
int poly_eval(int *a, int degree, int x)
{
    const unsigned int base = (unsigned int)x;
    unsigned int power = 1;     /* x^i for the term being added */
    unsigned int total = 0;
    int i;

    for (i = 0; i <= degree; i++) {
        total += (unsigned int)a[i] * power;
        power *= base;
    }

    /* Back to int: same bit pattern on two's-complement targets. */
    return (int)total;
}
|
||||
|
||||
/* 多项式计算函数。注意:这个只是一个参考实现,你需要实现自己的版本 */
|
||||
|
||||
/*
|
||||
友情提示:lcc支持ATT格式的嵌入式汇编,例如
|
||||
|
||||
_asm("movl %eax,%ebx");
|
||||
_asm("pushl %edx");
|
||||
|
||||
可以在lcc中project->configuration->Compiler->Code Generation->Generate .asm,
|
||||
将其选中后,可以在lcc目录下面生成对应程序的汇编代码实现。通过查看汇编文件,
|
||||
你可以了解编译器是如何实现你的代码的。有些实现可能非常低效。
|
||||
你可以在适当的地方加入嵌入式汇编,来大幅度提高计算性能。
|
||||
*/
|
||||
/*
 * Evaluates the fixed polynomial 37 + 72*x + 84*x^2 + 52*x^3.
 *
 *   not_use, not_use2 - ignored; present only so the signature matches the
 *                       other evaluators
 *   x                 - evaluation point
 *
 * Each constant multiplication is decomposed into shifts and adds:
 *   72 = 64 + 8,  84 = 64 + 16 + 4,  52 = 32 + 16 + 4.
 * The work is done on unsigned values because left-shifting a negative int
 * and overflowing a signed int are both undefined behaviour; unsigned
 * arithmetic wraps and gives the same bit pattern.
 */
int const_poly_eval(int *not_use, int not_use2, int x)
{
    const unsigned int ux = (unsigned int)x;
    const unsigned int sq = ux * ux;        /* x^2 */
    const unsigned int cube = sq * ux;      /* x^3 */

    /* 72*x = 64x + 8x */
    const unsigned int term1 = (ux << 6) + (ux << 3);

    /* 84*x^2 = 64x^2 + 16x^2 + 4x^2 */
    const unsigned int term2 = (sq << 6) + (sq << 4) + (sq << 2);

    /* 52*x^3 = 32x^3 + 16x^3 + 4x^3 */
    const unsigned int term3 = (cube << 5) + (cube << 4) + (cube << 2);

    (void)not_use;
    (void)not_use2;

    return (int)(37u + term1 + term2 + term3);
}
|
||||
|
||||
/*
 * Polynomial evaluation using Horner's rule.
 *
 * Returns a[0] + a[1]*x + ... + a[degree]*x^degree.
 *
 *   a      - coefficient array; a[i] multiplies x^i, indices 0..degree are read
 *   degree - highest power; a negative value returns 0
 *   x      - evaluation point
 *
 * degree == 10 takes a fully unrolled Horner chain. Every other degree folds
 * two coefficients per iteration using x^2, with one final single step when
 * an even number of powers leaves a[0] unprocessed.
 *
 * Arithmetic is unsigned so intermediate values wrap; the int version
 * overflowed as signed, which is undefined behaviour.
 */
int poly_eval12(int* a, int degree, int x) {
    const unsigned int ux = (unsigned int)x;
    const unsigned int ux2 = ux * ux;
    unsigned int acc;
    int i;

    if (degree == 10) {
        /* Horner's rule, fully unrolled for degree 10 */
        acc = (unsigned int)a[10];
        acc = acc * ux + (unsigned int)a[9];
        acc = acc * ux + (unsigned int)a[8];
        acc = acc * ux + (unsigned int)a[7];
        acc = acc * ux + (unsigned int)a[6];
        acc = acc * ux + (unsigned int)a[5];
        acc = acc * ux + (unsigned int)a[4];
        acc = acc * ux + (unsigned int)a[3];
        acc = acc * ux + (unsigned int)a[2];
        acc = acc * ux + (unsigned int)a[1];
        acc = acc * ux + (unsigned int)a[0];
        return (int)acc;
    }

    /* General case: consume coefficients a[i] and a[i-1] together */
    acc = 0;
    for (i = degree; i > 0; i -= 2) {
        acc = acc * ux2 + (unsigned int)a[i] * ux + (unsigned int)a[i - 1];
    }

    /* Even degree (including 0): a[0] is still pending */
    if (i == 0) {
        acc = acc * ux + (unsigned int)a[0];
    }

    return (int)acc;
}
|
||||
/*
|
||||
这个表格包含多个数组元素,每一组元素(函数名字, "描述字符串")
|
||||
将你认为最好的两个实现,放在最前面。
|
||||
比如:
|
||||
{my_poly_eval1, "超级垃圾实现"},
|
||||
{my_poly_eval2, "好一点的实现"},
|
||||
*/
|
||||
|
||||
/* Table of polynomial evaluators, each paired with a description string. */
peval_fun_rec peval_fun_tab[] =
|
||||
{
|
||||
|
||||
/* First entry: should be your implementation with the best CPE */
|
||||
{poly_eval12, "程景愉的CPE"},
|
||||
/* Second entry: should be your implementation with the best performance at degree 10 */
|
||||
{poly_eval12, "程景愉的10阶实现"},
|
||||
|
||||
{poly_eval, "poly_eval: 参考实现"},
|
||||
|
||||
/* The entry below must not be modified or removed: it marks the end of the list */
|
||||
{NULL, ""}
|
||||
};
|
||||
@ -2,11 +2,11 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
//#include <random.h>
|
||||
#include "poly.h"
|
||||
#include "cpe.h"
|
||||
#include "clock.h"
|
||||
#include <time.h>
|
||||
|
||||
double CPU_Mhz;
|
||||
|
||||
@ -18,7 +18,7 @@ static int coeff[MAXDEGREE+1];
|
||||
|
||||
#define MAX_ITER_COUNT 100
|
||||
|
||||
#define REF_CPU_MHZ 2292.6 // <20><><EFBFBD><EFBFBD><EFBFBD>ҵĴ<D2B5><C4B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ
|
||||
#define REF_CPU_MHZ 2292.6 // <20><><EFBFBD><EFBFBD><EFBFBD>ҵĴ<D2B5><C4B4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƶ
|
||||
|
||||
/* Define performance standards */
|
||||
static struct {
|
||||
@ -27,7 +27,7 @@ static struct {
|
||||
} cstandard[3] =
|
||||
{{4.00, 1.75}, /* CPE */
|
||||
{50, 43}, /* C(10) */
|
||||
{57,31} /* <20><>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD> */
|
||||
{57,31} /* <20><>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD> */
|
||||
};
|
||||
|
||||
int coeff_const[4];
|
||||
@ -83,7 +83,7 @@ static void init_const_poly(void)
|
||||
coeff_const[i] = rand_div+10;
|
||||
}
|
||||
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>poly.c<><63>const_poly_eval<61><6C><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><C4B3><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD>㣡\n");
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>poly.c<><63>const_poly_eval<61><6C><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ij<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㣡\n");
|
||||
printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]);
|
||||
|
||||
fixval_const = ref_poly_eval(coeff_const, 3, xval);
|
||||
@ -98,15 +98,15 @@ void test_const_poly(void)
|
||||
int my_cal = const_poly_eval(coeff_const, 3, xval);
|
||||
if (fixval_const != my_cal)
|
||||
{
|
||||
printf("<EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>const_poly_evalʵ<EFBFBD>ִ<EFBFBD><EFBFBD><EFBFBD>x=%d<><64><EFBFBD><EFBFBD>Ԥ<EFBFBD>ڽ<EFBFBD><DABD><EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD>Ǽ<EFBFBD><C7BC><EFBFBD>õ<EFBFBD><C3B5><EFBFBD><EFBFBD><EFBFBD>%d\n",xval,fixval_const,my_cal);
|
||||
printf("<EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>const_poly_evalʵ<EFBFBD>ִ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>x=%d<><64><EFBFBD><EFBFBD>Ԥ<EFBFBD>ڽ<EFBFBD><DABD><EFBFBD><EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD>Ǽ<EFBFBD><C7BC><EFBFBD><EFBFBD>õ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%d\n",xval,fixval_const,my_cal);
|
||||
exit(0);
|
||||
}
|
||||
fix_time = 0;
|
||||
for (i=0;i<MAX_ITER_COUNT;i++)
|
||||
fix_time += measure_function(run_fun_const, 3);
|
||||
fix_time = fix_time / MAX_ITER_COUNT;
|
||||
printf(" <20><>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1> = %.1f\n", fix_time);
|
||||
printf(" <20><>ߵij<DFB5>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD>÷<EFBFBD> ============== %.0f\n",
|
||||
printf(" <20><>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><CABD><EFBFBD><EFBFBD>ʱ<EFBFBD><CAB1> = %.1f\n", fix_time);
|
||||
printf(" <20><><EFBFBD>ߵij<EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>÷<EFBFBD> ============== %.0f\n",
|
||||
compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest));
|
||||
}
|
||||
|
||||
@ -133,7 +133,7 @@ int test_poly(peval_fun f, FILE *rpt) {
|
||||
ok = 0;
|
||||
if (rpt) {
|
||||
fprintf(rpt,
|
||||
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||
MAXDEGREE-i, v, pval[i]);
|
||||
}
|
||||
}
|
||||
@ -143,7 +143,7 @@ int test_poly(peval_fun f, FILE *rpt) {
|
||||
ok = 0;
|
||||
if (rpt) {
|
||||
fprintf(rpt,
|
||||
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><D6B5>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||
"<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㲻<EFBFBD>ԣ<EFBFBD><EFBFBD><EFBFBD>=%dʱ<64><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ<EFBFBD><EFBFBD>%d<><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ȷֵ<C8B7><D6B5>%d\n",
|
||||
FIXDEGREE, v, fixval);
|
||||
}
|
||||
}
|
||||
@ -176,7 +176,7 @@ void run_poly(peval_fun f, char *descr, double *cpep, double *cfixp)
|
||||
double cpe=0;
|
||||
double fix_time=0;
|
||||
pfun = f;
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s\n", descr);
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s\n", descr);
|
||||
if (test_poly(f, stdout)) {
|
||||
cpe = 0;
|
||||
for (i=0;i<MAX_ITER_COUNT;i++)
|
||||
@ -207,7 +207,7 @@ static double compute_score(double cmeas, double cref, double cbest)
|
||||
return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
|
||||
}
|
||||
|
||||
/* <20><><EFBFBD><EFBFBD>һ<EFBFBD><D2BB>0~divv-1֮<31><D6AE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͬʱ<CDAC><CAB1><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
/* <20><><EFBFBD><EFBFBD>һ<EFBFBD><D2BB>0~divv-1֮<31><D6AE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ͬʱ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> */
|
||||
void GenerateRandomNumber(unsigned long divv)
|
||||
{
|
||||
unsigned long long x = rand1_h;
|
||||
@ -231,18 +231,18 @@ int main(int argc, char *argv[])
|
||||
|
||||
// CPU_Factor();
|
||||
// GetCpuClock();
|
||||
printf("\t2015<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>Ż<EFBFBD>ʵ<EFBFBD>飬<EFBFBD><EFBFBD>ӭ<EFBFBD>㣡\n");
|
||||
printf("\t2015<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>Ż<EFBFBD>ʵ<EFBFBD>飬<EFBFBD><EFBFBD>ӭ<EFBFBD>㣡\n");
|
||||
printf("============================\n");
|
||||
|
||||
if (argc == 1)
|
||||
{
|
||||
printf("ʹ<EFBFBD>÷<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s ѧ<>ź<EFBFBD>6λ [ѧ<>ź<EFBFBD>6λ] [ѧ<>ź<EFBFBD>6λ] ...\n",argv[0]);
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><CAB5>һ<EFBFBD><D2BB><EFBFBD><EFBFBD>ϵ<EFBFBD><CFB5><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>ļ<EFBFBD><C4BC>㣬<EFBFBD><E3A3AC><EFBFBD><EFBFBD><EFBFBD>ܿ<EFBFBD>Ŷ....\n");
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD>⣬<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ķ<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>10<EFBFBD>Ķ<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㣬Ҫ<EFBFBD>죡\n");
|
||||
printf("ʹ<EFBFBD>÷<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>%s ѧ<>ź<EFBFBD>6λ [ѧ<>ź<EFBFBD>6λ] [ѧ<>ź<EFBFBD>6λ] ...\n",argv[0]);
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><EFBFBD>һ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ϵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD>ļ<EFBFBD><EFBFBD>㣬<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ܿ<EFBFBD>Ŷ....\n");
|
||||
printf("<EFBFBD><EFBFBD><EFBFBD>⣬<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD>дpoly.c<><63><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʵ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ķ<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>10<EFBFBD>Ķ<EFBFBD><EFBFBD><EFBFBD>ʽ<EFBFBD><EFBFBD><EFBFBD>㣬Ҫ<EFBFBD>죡\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*<2A><><EFBFBD><EFBFBD>ѧ<EFBFBD>ţ<EFBFBD><C5A3><EFBFBD>ʼ<EFBFBD><CABC>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*/
|
||||
/*<2A><><EFBFBD><EFBFBD>ѧ<EFBFBD>ţ<EFBFBD><C5A3><EFBFBD>ʼ<EFBFBD><CABC>һ<EFBFBD><D2BB><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>*/
|
||||
rand1_h = (unsigned long)atoi(argv[1]);
|
||||
rand1_l=0x29A;
|
||||
GenerateRandomNumber(0);
|
||||
@ -267,10 +267,10 @@ int main(int argc, char *argv[])
|
||||
//make_CPU_busy();
|
||||
run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix);
|
||||
if (i == 0)
|
||||
printf(" <20><>ߵ<EFBFBD>CPE<50>÷<EFBFBD> =========================== %.0f\n",
|
||||
printf(" <20><><EFBFBD>ߵ<EFBFBD>CPE<EFBFBD>÷<EFBFBD> =========================== %.0f\n",
|
||||
compute_score(cpe, cstandard[0].cref, cstandard[0].cbest));
|
||||
if (i == 1)
|
||||
printf(" <20><>ߵ<EFBFBD>C(10)<29>÷<EFBFBD> ========================= %.0f\n",
|
||||
printf(" <20><><EFBFBD>ߵ<EFBFBD>C(10)<29>÷<EFBFBD> ========================= %.0f\n",
|
||||
compute_score(cfix, cstandard[1].cref, cstandard[1].cbest));
|
||||
}
|
||||
return 0;
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
Before Width: | Height: | Size: 655 KiB |
@ -1,7 +1,8 @@
|
||||
# Makefile for word frequency analysis program
|
||||
|
||||
CC = icx
|
||||
CFLAGS = -Ofast -pg
|
||||
#CFLAGS = -O2 -pg
|
||||
CFLAGS = -Ofast -Wall
|
||||
TARGET = prog
|
||||
SOURCES = prog.c options.c
|
||||
|
||||
|
||||
BIN
profile/gmon.out
BIN
profile/gmon.out
Binary file not shown.
BIN
profile/prog
BIN
profile/prog
Binary file not shown.
@ -3,7 +3,6 @@
|
||||
|
||||
#include "options.h"
|
||||
#include "string.h"
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <time.h>
|
||||
@ -26,13 +25,28 @@ typedef void (*lower_fun_t)(char *s);
|
||||
|
||||
/* Lower case conversion routines */
|
||||
|
||||
/* Convert string to lower case: slow */
|
||||
void lower1(char *s) {
|
||||
int i;
|
||||
/* Convert string to lower case: optimized with lookup table */
|
||||
static unsigned char lcase_table[256];
|
||||
static int table_initialized = 0;
|
||||
|
||||
for (i = 0; i < Strlen(s); i++)
|
||||
if (s[i] >= 'A' && s[i] <= 'Z')
|
||||
s[i] -= ('A' - 'a');
|
||||
void init_lcase_table() {
|
||||
if (!table_initialized) {
|
||||
int i;
|
||||
for (i = 0; i < 256; i++)
|
||||
lcase_table[i] = i;
|
||||
for (i = 'A'; i <= 'Z'; i++)
|
||||
lcase_table[i] = i - ('A' - 'a');
|
||||
table_initialized = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void lower1(char *s) {
|
||||
init_lcase_table();
|
||||
unsigned char *us = (unsigned char *)s;
|
||||
while (*us) {
|
||||
*us = lcase_table[*us];
|
||||
us++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert string to lower case: faster */
|
||||
@ -137,9 +151,20 @@ unsigned h_xor(char *s) {
|
||||
return val % tsize;
|
||||
}
|
||||
|
||||
#define HCNT 3
|
||||
hash_fun_t hash_fun_set[HCNT] = {h_mod, h_add, h_xor};
|
||||
char *hash_fun_names[HCNT] = {"h_mod", "h_add", "h_xor"};
|
||||
/* FNV-1a hash function */
|
||||
unsigned h_fnv1a(char *s) {
|
||||
unsigned hash = 2166136261u;
|
||||
unsigned char *us = (unsigned char *)s;
|
||||
while (*us) {
|
||||
hash ^= *us++;
|
||||
hash *= 16777619u;
|
||||
}
|
||||
return hash % tsize;
|
||||
}
|
||||
|
||||
#define HCNT 4
|
||||
hash_fun_t hash_fun_set[HCNT] = {h_mod, h_add, h_xor, h_fnv1a};
|
||||
char *hash_fun_names[HCNT] = {"h_mod", "h_add", "h_xor", "h_fnv1a"};
|
||||
|
||||
char *save_string(char *s) {
|
||||
char *result = (char *)malloc(Strlen(s) + 1);
|
||||
@ -194,7 +219,6 @@ h_ptr find_ele_iter_f(h_ptr ls, char *s) {
|
||||
h_ptr find_ele_iter_r(h_ptr ls, char *s) {
|
||||
h_ptr ele = ls;
|
||||
h_ptr last = NULL;
|
||||
#pragma omp parallel shared(ls, s, last)
|
||||
for (ele = ls; ele; ele = ele->next) {
|
||||
char *word = ele->word;
|
||||
if (strcmp(s, word) == 0) {
|
||||
@ -220,10 +244,10 @@ h_ptr find_ele_iter_r(h_ptr ls, char *s) {
|
||||
typedef h_ptr (*find_ele_fun_t)(h_ptr, char *);
|
||||
|
||||
#define FCNT 3
|
||||
find_ele_fun_t find_ele_fun_set[FCNT] = {find_ele_iter_r, find_ele_iter_f,
|
||||
find_ele_rec};
|
||||
char *find_ele_fun_names[FCNT] = {"find_ele_iter_r", "find_ele_iter_f",
|
||||
"find_ele_rec"};
|
||||
find_ele_fun_t find_ele_fun_set[FCNT] = {find_ele_rec, find_ele_iter_f,
|
||||
find_ele_iter_r};
|
||||
char *find_ele_fun_names[FCNT] = {"find_ele_rec", "find_ele_iter_f",
|
||||
"find_ele_iter_r"};
|
||||
|
||||
/* Comparision function for sorting */
|
||||
int compare_ele(const void *vele1, const void *vele2) {
|
||||
@ -386,8 +410,8 @@ void word_freq(FILE *src, int verbose, int ngram, int size, int quick,
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int verbose = 1;
|
||||
int size = 1024;
|
||||
int hash_fun_index = 0;
|
||||
int size = 32768; // 修改默认值为32768
|
||||
int hash_fun_index = 3;
|
||||
int lower_fun_index = 0;
|
||||
int find_fun_index = 0;
|
||||
int ngram = 1;
|
||||
|
||||
Reference in New Issue
Block a user