diff --git a/perflab/matrix/Makefile b/perflab/matrix/Makefile
new file mode 100644
index 0000000..2dc672d
--- /dev/null
+++ b/perflab/matrix/Makefile
@@ -0,0 +1,34 @@
+CC = gcc
+CFLAGS = -Wall -O1 -g
+# LDFLAGS is left undefined (expands to nothing); the CUDA link flags are kept for reference:
+#LDFLAGS = -lm -lcudart -lcuda
+# C sources; each one is compiled to an object file with the same base name.
+SRCS = rowcol_test.c clock.c cpe.c fcyc.c lsquare.c rowcol_202302723005.c
+#CUDA_SRCS = rowcol.cu
+OBJS = $(SRCS:.c=.o)
+#rowcol.o
+
+# Name of the linked test program.
+TARGET = matrix_test
+
+# Running `make` with no arguments builds the test program.
+all: $(TARGET)
+
+# Link every object into the test program.
+$(TARGET): $(OBJS)
+	$(CC) $^ -o $@ $(LDFLAGS)
+
+# Compile one C source into its object file.
+%.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Disabled CUDA rule (NVCC and CUDA_FLAGS are not defined in this file).
+#rowcol.o: rowcol.cu
+#	$(NVCC) $(CUDA_FLAGS) -c $< -o $@
+
+# Remove the objects and the linked program.
+clean:
+	rm -f $(OBJS) $(TARGET)
+
+# all and clean name commands, not files.
+.PHONY: all clean
diff --git a/perflab/matrix/clock.c b/perflab/matrix/clock.c
index a587590..b826af4 100644
--- a/perflab/matrix/clock.c
+++ b/perflab/matrix/clock.c
@@ -1,229 +1,196 @@
-/* clock.c
- * Retrofitted to use thread-specific timers
- * and to get clock information from /proc/cpuinfo
- * (C) R. E. Bryant, 2010
- *
- */
-
-/* When this constant is not defined, uses time stamp counter */
-#define USE_POSIX 0
-
-/* Choice to use cpu_gettime call or Intel time stamp counter directly */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <intrin.h>
-//#include <intrinsics.h>
-#include <windows.h>
-#include <time.h>
-#include "clock.h"
-
-/* Use x86 cycle counter */
-
-/* Initialize the cycle counter */
-static unsigned cyc_hi = 0;
-static unsigned cyc_lo = 0;
-
-/* Set *hi and *lo to the high and low order bits  of the cycle counter.
-   Implementation requires assembly code to use the rdtsc instruction. */
-void access_counter(unsigned *hi, unsigned *lo)
-{
-
-	long long counter;
-
-	counter = __rdtsc();
-	(*hi) = (unsigned int)(counter >> 32);
-	(*lo) = (unsigned int)counter;
-/*
-
-	LARGE_INTEGER lPerformanceCount;
-
-	QueryPerformanceCounter(&lPerformanceCount);
-	(*hi) = (unsigned int)lPerformanceCount.HighPart;
-	(*lo) = (unsigned int)lPerformanceCount.LowPart;
-//	printf("%08X %08X\n",(*hi),(*lo));
-*/
-}
-
-
-/* Record the current value of the cycle counter. */
-void start_counter()
-{
-    access_counter(&cyc_hi, &cyc_lo);
-}
-
-/* Return the number of cycles since the last call to start_counter. */
-double get_counter()
-{
-    unsigned ncyc_hi, ncyc_lo;
-    unsigned hi, lo, borrow;
-    double result;
-
-    /* Get cycle counter */
-    access_counter(&ncyc_hi, &ncyc_lo);
-
-    /* Do double precision subtraction */
-    lo = ncyc_lo - cyc_lo;
-    borrow = cyc_lo > ncyc_lo;
-    hi = ncyc_hi - cyc_hi - borrow;
-    result = (double) hi * (1 << 30) * 4 + lo;
-    return result;
-}
-void make_CPU_busy(void)
-{
-	volatile double old_tick,new_tick;
-	start_counter();
-	old_tick = get_counter();
-	new_tick = get_counter();
-	while (new_tick - old_tick < 1000000000)
-		new_tick = get_counter();
-}
-
-//CPUµÄÆµÂÊ
-double mhz(int verbose)
-{
-    LARGE_INTEGER lFrequency;
-    LARGE_INTEGER lPerformanceCount_Start;
-    LARGE_INTEGER lPerformanceCount_End;
-	double mhz;
-	double fTime;
-	__int64 _i64StartCpuCounter;
-	__int64 _i64EndCpuCounter;
-    //On a multiprocessor machine, it should not matter which processor is called.
-    //However, you can get different results on different processors due to bugs in
-    //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
-    HANDLE hThread=GetCurrentThread();
-    SetThreadAffinityMask(hThread,0x1);
-
-    //Ö÷°åÉÏ¸ß¾«¶È¶¨Ê±Æ÷µÄ¾§ÕñÆµÂÊ
-    //Õâ¸ö¶¨Ê±Æ÷Ó¦¸Ã¾ÍÊÇÒ»Æ¬8253»òÕß8254
-    //ÔÚintel ich7ÖÐ¼¯³ÉÁË8254
-    QueryPerformanceFrequency(&lFrequency);
-//    if (verbose>0)
-//    	printf("¸ß¾«¶È¶¨Ê±Æ÷µÄ¾§ÕñÆµÂÊ£º%1.0fHz.\n",(double)lFrequency.QuadPart);
-
-    //Õâ¸ö¶¨Ê±Æ÷Ã¿¾­¹ýÒ»¸öÊ±ÖÓÖÜÆÚ£¬Æä¼ÆÊýÆ÷»á+1
-    QueryPerformanceCounter(&lPerformanceCount_Start);
-
-    //RDTSCÖ¸Áî:»ñÈ¡CPU¾­ÀúµÄÊ±ÖÓÖÜÆÚÊý
-    _i64StartCpuCounter=__rdtsc();
-
-    //ÑÓÊ±³¤Ò»µã,Îó²î»áÐ¡Ò»µã
-    //int nTemp=100000;
-    //while (--nTemp);
-    Sleep(200);
-
-    QueryPerformanceCounter(&lPerformanceCount_End);
-
-    _i64EndCpuCounter=__rdtsc();
-
-    //f=1/T => f=¼ÆÊý´ÎÊý/(¼ÆÊý´ÎÊý*T)
-    //ÕâÀïµÄ¡°¼ÆÊý´ÎÊý*T¡±¾ÍÊÇÊ±¼ä²î
-    fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
-        /(double)lFrequency.QuadPart;
-
- 		mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
-    if (verbose>0)
-    	printf("CPUÆµÂÊÎª:%1.6fMHz.\n",mhz);
-    return mhz;
-}
-
-double CPU_Factor1(void)
-{
-	double result;
-	int i,j,k,ii,jj,kk;
-	LARGE_INTEGER lStart,lEnd;
-  LARGE_INTEGER lFrequency;
-  HANDLE hThread;
-  double fTime;
-
-  QueryPerformanceFrequency(&lFrequency);
-
-	ii = 43273;
-	kk = 1238;
-	result = 1;
-	jj = 1244;
-
-    hThread=GetCurrentThread();
-    SetThreadAffinityMask(hThread,0x1);
-  QueryPerformanceCounter(&lStart);
-  //_asm("cpuid");
-	start_counter();
-	for (i=0;i<100;i++)
-		for (j=0;j<1000;j++)
-			for (k=0;k<1000;k++)
-				kk += kk*ii+jj;
-
-	result = get_counter();
-	QueryPerformanceCounter(&lEnd);
-  fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
-	printf("CPUÔËÐÐÊ±¼äÎª%f",result);
-	printf("\t %f\n",fTime);
-	return result;
-}
-
-double CPU_Factor(void)
-{
- double frequency;
- double multiplier = 1000 * 1000 * 1000;//nano
- LARGE_INTEGER lFrequency;
- LARGE_INTEGER start,stop;
- HANDLE hThread;
- int i;
- const int gigahertz= 1000*1000*1000;
- const int known_instructions_per_loop = 27317; 
-
- int iterations = 100000000;
- int g = 0;
- double normal_ticks_per_second;
-double ticks;
-double time;
-double loops_per_sec;
-double instructions_per_loop;
-double ratio;
-double actual_freq;
-
- QueryPerformanceFrequency(&lFrequency);
- frequency = (double)lFrequency.QuadPart;
-
- hThread=GetCurrentThread();
- SetThreadAffinityMask(hThread,0x1);
- QueryPerformanceCounter(&start);
- for( i = 0; i < iterations; i++)
- {
-   g++;
-   g++;
-   g++;
-   g++;
- }
- QueryPerformanceCounter(&stop);
-
- //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
- normal_ticks_per_second = frequency * 1000;
- ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
- time = (ticks * multiplier) /frequency;
- loops_per_sec = iterations / (time/multiplier);
- instructions_per_loop = normal_ticks_per_second  / loops_per_sec;
-
- ratio = (instructions_per_loop / known_instructions_per_loop);
- actual_freq = normal_ticks_per_second / ratio;
-/* 
- actual_freq = normal_ticks_per_second / ratio;
- actual_freq = known_instructions_per_loop*iterations*multiplier/time;
-
-	2293 = x/time;
-	
-	2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
- loops_per_sec = iterations*frequency / ticks
- 
- instructions_per_loop =   / loops_per_sec;
-*/ 
- printf("Perf counter freq: %f\n", normal_ticks_per_second);
- printf("Loops per sec:      %f\n", loops_per_sec);
- printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
- printf("Presumed freq: %f\n", actual_freq);
- printf("ratio: %f\n", ratio);
- printf("time=%f\n",time);
- return ratio;
-}
+/* clock.c
+ * Retrofitted to use thread-specific timers
+ * and to get clock information from /proc/cpuinfo
+ * (C) R. E. Bryant, 2010
+ * Modified for cross-platform compatibility
+ */
+
+#define _GNU_SOURCE // For sched_setaffinity on Linux
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include <intrin.h>
+#include <windows.h>
+#else
+#include <sched.h>
+#include <time.h>
+#include <unistd.h>
+#include <x86intrin.h>
+typedef struct {
+  uint64_t QuadPart;
+} LARGE_INTEGER;
+typedef void *HANDLE;
+#define __int64 long long
+#define Sleep(ms) usleep((ms) * 1000)
+#endif
+
+#include "clock.h"
+
+/* Use x86 cycle counter */
+static unsigned cyc_hi = 0;
+static unsigned cyc_lo = 0;
+
+void access_counter(unsigned *hi, unsigned *lo) { /* split one __rdtsc() reading into two 32-bit halves */
+  const uint64_t tsc = __rdtsc();
+  *hi = (unsigned)(tsc >> 32);         /* upper 32 bits */
+  *lo = (unsigned)(tsc & 0xFFFFFFFFu); /* lower 32 bits */
+}
+
+void start_counter() { access_counter(&cyc_hi, &cyc_lo); } /* store the current counter reading in cyc_hi/cyc_lo */
+
+double get_counter() { /* counter ticks elapsed relative to the reading stored in cyc_hi/cyc_lo */
+  unsigned now_hi, now_lo;
+  access_counter(&now_hi, &now_lo);
+  const uint64_t then = ((uint64_t)cyc_hi << 32) | cyc_lo; /* stored reference reading */
+  const uint64_t now = ((uint64_t)now_hi << 32) | now_lo;  /* reading just taken */
+  return (double)(now - then);
+}
+
+void make_CPU_busy(void) { /* busy-wait until get_counter() has advanced by 1e9 ticks */
+  volatile double old_tick = get_counter();
+  volatile double new_tick = old_tick; /* defined start value: the loop test reads it before the first assignment */
+  while ((new_tick - old_tick) < 1000000000) {
+    new_tick = get_counter();
+  }
+}
+
+#ifdef _WIN32
+#define GET_TIME(dest) QueryPerformanceCounter(dest)
+#else
+static inline void GET_TIME(LARGE_INTEGER *dest) { /* non-Windows timestamp: CLOCK_MONOTONIC in nanoseconds */
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC, &now); /* return value is not checked */
+  dest->QuadPart = (uint64_t)now.tv_sec * 1000000000 + now.tv_nsec; /* seconds scaled to ns, plus the ns part */
+}
+#define QueryPerformanceFrequency(freq) ((freq)->QuadPart = 1000000000)
+#endif
+
+double mhz(int verbose) { /* estimate the __rdtsc() tick rate in MHz; prints it when verbose > 0 */
+  LARGE_INTEGER lFrequency;
+  LARGE_INTEGER lPerformanceCount_Start;
+  LARGE_INTEGER lPerformanceCount_End;
+  double mhz;
+  double fTime;
+  __int64 _i64StartCpuCounter;
+  __int64 _i64EndCpuCounter;
+  /* Restrict the calling thread to CPU 0 before sampling (the result of the call is ignored). */
+#ifdef _WIN32
+  HANDLE hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1);
+#else
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(0, &cpuset);
+  sched_setaffinity(0, sizeof(cpuset), &cpuset);
+#endif
+  /* Bracket a 200 ms sleep with a timer reading and a counter reading on each side. */
+  QueryPerformanceFrequency(&lFrequency);
+  GET_TIME(&lPerformanceCount_Start);
+  _i64StartCpuCounter = __rdtsc();
+  Sleep(200);
+  GET_TIME(&lPerformanceCount_End);
+  _i64EndCpuCounter = __rdtsc();
+  /* Elapsed seconds = timer ticks / timer frequency; MHz = counter ticks / elapsed microseconds. */
+  fTime = (lPerformanceCount_End.QuadPart - lPerformanceCount_Start.QuadPart) /
+          (double)lFrequency.QuadPart;
+  mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
+  /* Print only when the caller asked for it. */
+  if (verbose > 0) {
+    printf("CPUé¢‘çŽ‡ä¸º: %.6fMHz.\n", mhz);
+  }
+  return mhz;
+}
+
+double CPU_Factor1(void) { /* times a fixed 1e8-iteration arithmetic loop; returns the counter ticks it took */
+  double result;
+  int i, j, k;
+  LARGE_INTEGER lStart, lEnd;
+  LARGE_INTEGER lFrequency;
+  double fTime;
+  volatile unsigned kk = 1238; /* volatile sink: keeps the optimizer from deleting the timed loop */
+#ifdef _WIN32
+  HANDLE hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1);
+#else
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(0, &cpuset);
+  sched_setaffinity(0, sizeof(cpuset), &cpuset); /* pin to CPU 0; result ignored */
+#endif
+  /* Take the timer reading first, then start the cycle counter. */
+  QueryPerformanceFrequency(&lFrequency);
+  GET_TIME(&lStart);
+  start_counter();
+  /* Timed workload: 100 * 1000 * 1000 iterations of a multiply-add recurrence. */
+  for (i = 0; i < 100; i++)
+    for (j = 0; j < 1000; j++)
+      for (k = 0; k < 1000; k++)
+        kk += kk * 43273u + 1244u; /* unsigned, so wrap-around is well defined */
+  /* Stop in the reverse order: cycle counter first, then the timer. */
+  result = get_counter();
+  GET_TIME(&lEnd);
+  /* fTime is the elapsed timer ticks divided by the timer frequency, i.e. seconds. */
+  fTime = (lEnd.QuadPart - lStart.QuadPart) / (double)lFrequency.QuadPart;
+  printf("CPUè®¡ç®—æ—¶é•¿ä¸º: %f", result);
+  printf("\t %f\n", fTime);
+  return result;
+}
+
+double CPU_Factor(void) { /* times a fixed counting loop, prints the derived figures, returns `ratio` */
+  double frequency;
+  double multiplier = 1000 * 1000 * 1000; // nano
+  LARGE_INTEGER lFrequency;
+  LARGE_INTEGER start, stop;
+  int i;
+  const int known_instructions_per_loop = 27317;
+  int iterations = 100000000;
+  volatile int g = 0; /* volatile: g is never read, so a plain int lets the optimizer delete the timed loop */
+  double normal_ticks_per_second;
+  double ticks;
+  double time;
+  double loops_per_sec;
+  double instructions_per_loop;
+  double ratio;
+  double actual_freq;
+  /* Restrict the calling thread to CPU 0 before timing (the result of the call is ignored). */
+#ifdef _WIN32
+  HANDLE hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1);
+#else
+  cpu_set_t cpuset;
+  CPU_ZERO(&cpuset);
+  CPU_SET(0, &cpuset);
+  sched_setaffinity(0, sizeof(cpuset), &cpuset);
+#endif
+  /* Read the timer frequency, then take the start timestamp. */
+  QueryPerformanceFrequency(&lFrequency);
+  frequency = (double)lFrequency.QuadPart;
+  GET_TIME(&start);
+  /* Timed region: four increments of g per iteration (g ends at 4e8, within int range). */
+  for (i = 0; i < iterations; i++) {
+    g++;
+    g++;
+    g++;
+    g++;
+  }
+  /* Take the stop timestamp, then derive the figures printed below. */
+  GET_TIME(&stop);
+  normal_ticks_per_second = frequency * 1000;
+  ticks = (double)(stop.QuadPart - start.QuadPart);
+  time = (ticks * multiplier) / frequency; /* elapsed time scaled by `multiplier` (1e9) */
+  loops_per_sec = iterations / (time / multiplier);
+  instructions_per_loop = normal_ticks_per_second / loops_per_sec;
+  ratio = instructions_per_loop / known_instructions_per_loop;
+  actual_freq = normal_ticks_per_second / ratio;
+  /* Print every intermediate value. */
+  printf("Perf counter freq: %f\n", normal_ticks_per_second);
+  printf("Loops per sec:      %f\n", loops_per_sec);
+  printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
+  printf("Presumed freq: %f\n", actual_freq);
+  printf("ratio: %f\n", ratio);
+  printf("time=%f\n", time);
+  return ratio;
+}
diff --git a/perflab/matrix/clock.c.bak b/perflab/matrix/clock.c.bak
new file mode 100644
index 0000000..3b2a198
--- /dev/null
+++ b/perflab/matrix/clock.c.bak
@@ -0,0 +1,229 @@
+/* clock.c
+ * Retrofitted to use thread-specific timers
+ * and to get clock information from /proc/cpuinfo
+ * (C) R. E. Bryant, 2010
+ *
+ */
+
+/* When this constant is not defined, uses time stamp counter */
+#define USE_POSIX 0
+
+/* Choice to use cpu_gettime call or Intel time stamp counter directly */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <x86intrin.h>
+//#include <intrinsics.h>
+//#include <windows.h>
+#include <time.h>
+#include "clock.h"
+
+/* Use x86 cycle counter */
+
+/* Initialize the cycle counter */
+static unsigned cyc_hi = 0;
+static unsigned cyc_lo = 0;
+
+/* Set *hi and *lo to the high and low order bits  of the cycle counter.
+   Implementation requires assembly code to use the rdtsc instruction. */
+void access_counter(unsigned *hi, unsigned *lo)
+{
+
+	long long counter;
+
+	counter = __rdtsc();
+	(*hi) = (unsigned int)(counter >> 32);
+	(*lo) = (unsigned int)counter;
+/*
+
+	LARGE_INTEGER lPerformanceCount;
+
+	QueryPerformanceCounter(&lPerformanceCount);
+	(*hi) = (unsigned int)lPerformanceCount.HighPart;
+	(*lo) = (unsigned int)lPerformanceCount.LowPart;
+//	printf("%08X %08X\n",(*hi),(*lo));
+*/
+}
+
+
+/* Record the current value of the cycle counter. */
+void start_counter()
+{
+    access_counter(&cyc_hi, &cyc_lo);
+}
+
+/* Return the number of cycles since the last call to start_counter. */
+double get_counter()
+{
+    unsigned ncyc_hi, ncyc_lo;
+    unsigned hi, lo, borrow;
+    double result;
+
+    /* Get cycle counter */
+    access_counter(&ncyc_hi, &ncyc_lo);
+
+    /* Do double precision subtraction */
+    lo = ncyc_lo - cyc_lo;
+    borrow = cyc_lo > ncyc_lo;
+    hi = ncyc_hi - cyc_hi - borrow;
+    result = (double) hi * (1 << 30) * 4 + lo;
+    return result;
+}
+void make_CPU_busy(void)
+{
+	volatile double old_tick,new_tick;
+	start_counter();
+	old_tick = get_counter();
+	new_tick = get_counter();
+	while (new_tick - old_tick < 1000000000)
+		new_tick = get_counter();
+}
+
+// CPU frequency
+double mhz(int verbose)
+{
+    LARGE_INTEGER lFrequency;
+    LARGE_INTEGER lPerformanceCount_Start;
+    LARGE_INTEGER lPerformanceCount_End;
+	double mhz;
+	double fTime;
+	__int64 _i64StartCpuCounter;
+	__int64 _i64EndCpuCounter;
+    //On a multiprocessor machine, it should not matter which processor is called.
+    //However, you can get different results on different processors due to bugs in
+    //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
+    HANDLE hThread=GetCurrentThread();
+    SetThreadAffinityMask(hThread,0x1);
+
+    // Crystal-oscillator frequency of the high-precision timer on the motherboard.
+    // This timer is presumably an 8253 or an 8254 chip;
+    // the Intel ICH7 has an 8254 integrated.
+    QueryPerformanceFrequency(&lFrequency);
+//    if (verbose>0)
+//    	printf("¸ß¾«¶È¶¨Ê±Æ÷µÄ¾§ÕñÆµÂÊ£º%1.0fHz.\n",(double)lFrequency.QuadPart);
+
+    // The counter of this timer increments by 1 on every clock period of the timer.
+    QueryPerformanceCounter(&lPerformanceCount_Start);
+
+    // RDTSC instruction: read the number of clock cycles the CPU has gone through.
+    _i64StartCpuCounter=__rdtsc();
+
+    // A longer delay gives a smaller error.
+    //int nTemp=100000;
+    //while (--nTemp);
+    Sleep(200);
+
+    QueryPerformanceCounter(&lPerformanceCount_End);
+
+    _i64EndCpuCounter=__rdtsc();
+
+    // f = 1/T  =>  f = count / (count * T)
+    // Here "count * T" is simply the elapsed time.
+    fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
+        /(double)lFrequency.QuadPart;
+
+ 		mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
+    if (verbose>0)
+    	printf("CPUÆµÂÊÎª:%1.6fMHz.\n",mhz);
+    return mhz;
+}
+
+double CPU_Factor1(void)
+{
+	double result;
+	int i,j,k,ii,jj,kk;
+	LARGE_INTEGER lStart,lEnd;
+  LARGE_INTEGER lFrequency;
+  HANDLE hThread;
+  double fTime;
+
+  QueryPerformanceFrequency(&lFrequency);
+
+	ii = 43273;
+	kk = 1238;
+	result = 1;
+	jj = 1244;
+
+    hThread=GetCurrentThread();
+    SetThreadAffinityMask(hThread,0x1);
+  QueryPerformanceCounter(&lStart);
+  //_asm("cpuid");
+	start_counter();
+	for (i=0;i<100;i++)
+		for (j=0;j<1000;j++)
+			for (k=0;k<1000;k++)
+				kk += kk*ii+jj;
+
+	result = get_counter();
+	QueryPerformanceCounter(&lEnd);
+  fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
+	printf("CPUÔËÐÐÊ±¼äÎª%f",result);
+	printf("\t %f\n",fTime);
+	return result;
+}
+
+double CPU_Factor(void)
+{
+ double frequency;
+ double multiplier = 1000 * 1000 * 1000;//nano
+ LARGE_INTEGER lFrequency;
+ LARGE_INTEGER start,stop;
+ HANDLE hThread;
+ int i;
+ const int gigahertz= 1000*1000*1000;
+ const int known_instructions_per_loop = 27317; 
+
+ int iterations = 100000000;
+ int g = 0;
+ double normal_ticks_per_second;
+double ticks;
+double time;
+double loops_per_sec;
+double instructions_per_loop;
+double ratio;
+double actual_freq;
+
+ QueryPerformanceFrequency(&lFrequency);
+ frequency = (double)lFrequency.QuadPart;
+
+ hThread=GetCurrentThread();
+ SetThreadAffinityMask(hThread,0x1);
+ QueryPerformanceCounter(&start);
+ for( i = 0; i < iterations; i++)
+ {
+   g++;
+   g++;
+   g++;
+   g++;
+ }
+ QueryPerformanceCounter(&stop);
+
+ //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
+ normal_ticks_per_second = frequency * 1000;
+ ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
+ time = (ticks * multiplier) /frequency;
+ loops_per_sec = iterations / (time/multiplier);
+ instructions_per_loop = normal_ticks_per_second  / loops_per_sec;
+
+ ratio = (instructions_per_loop / known_instructions_per_loop);
+ actual_freq = normal_ticks_per_second / ratio;
+/* 
+ actual_freq = normal_ticks_per_second / ratio;
+ actual_freq = known_instructions_per_loop*iterations*multiplier/time;
+
+	2293 = x/time;
+	
+	2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
+ loops_per_sec = iterations*frequency / ticks
+ 
+ instructions_per_loop =   / loops_per_sec;
+*/ 
+ printf("Perf counter freq: %f\n", normal_ticks_per_second);
+ printf("Loops per sec:      %f\n", loops_per_sec);
+ printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
+ printf("Presumed freq: %f\n", actual_freq);
+ printf("ratio: %f\n", ratio);
+ printf("time=%f\n",time);
+ return ratio;
+}
diff --git a/perflab/matrix/clock.o b/perflab/matrix/clock.o
new file mode 100644
index 0000000..0e68ee3
Binary files /dev/null and b/perflab/matrix/clock.o differ
diff --git a/perflab/matrix/cpe.o b/perflab/matrix/cpe.o
new file mode 100644
index 0000000..96d717d
Binary files /dev/null and b/perflab/matrix/cpe.o differ
diff --git a/perflab/matrix/fcyc.c b/perflab/matrix/fcyc.c
index dc2f735..a5f4077 100644
--- a/perflab/matrix/fcyc.c
+++ b/perflab/matrix/fcyc.c
@@ -119,7 +119,7 @@ double fcyc(test_funct f, int *params)
 	    if (clear_cache)
 		clear();
 	    start_counter();
-	    f(params);
+	    f((long*)params);
 	    cyc = get_counter();
 	    if (cyc > 0.0)
 		add_sample(cyc);
@@ -131,7 +131,7 @@ double fcyc(test_funct f, int *params)
 		clear();
 	    start_counter();
 	    for (i=0;i<MAX_ITER_TIMES;i++)
-    		f(params);
+    		f((long*)params);
 	    cyc = get_counter()/MAX_ITER_TIMES;
 	    if (cyc > 0.0)
 		add_sample(cyc);
diff --git a/perflab/matrix/fcyc.o b/perflab/matrix/fcyc.o
new file mode 100644
index 0000000..0943503
Binary files /dev/null and b/perflab/matrix/fcyc.o differ
diff --git a/perflab/matrix/lsquare.o b/perflab/matrix/lsquare.o
new file mode 100644
index 0000000..f36c57e
Binary files /dev/null and b/perflab/matrix/lsquare.o differ
diff --git a/perflab/matrix/matrix_test b/perflab/matrix/matrix_test
new file mode 100644
index 0000000..9fbc0a4
Binary files /dev/null and b/perflab/matrix/matrix_test differ
diff --git a/perflab/matrix/rowcol.c b/perflab/matrix/rowcol.c
index 9e50cbb..b504582 100644
--- a/perflab/matrix/rowcol.c
+++ b/perflab/matrix/rowcol.c
@@ -1,77 +1,69 @@
 /**************************************************************************
-	ÐÐ/ÁÐÇóºÍº¯Êý¡£°´ÏÂÃæµÄÒªÇó±à¼­´ËÎÄ¼þ£º
-	1. ½«ÄãµÄÑ§ºÅ¡¢ÐÕÃû£¬ÒÔ×¢ÊÍµÄ·½Ê½Ð´µ½ÏÂÃæ£»
-	2. ÊµÏÖ²»Í¬°æ±¾µÄÐÐÁÐÇóºÍº¯Êý£»
-	3. ±à¼­rc_fun_rec rc_fun_tabÊý×é£¬½«ÄãµÄ×îºÃµÄ´ð°¸
-		£¨×îºÃµÄÐÐºÍÁÐÇóºÍ¡¢×îºÃµÄÁÐÇóºÍ£©×÷ÎªÊý×éµÄÇ°Á½Ïî
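+        Row/column sum functions. Edit this file as follows:
+        1. Write your student ID and name below as a comment;
+        2. Implement different versions of the row/column sum functions;
+        3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+                (best row-and-column sum, best column sum) are its first two entries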
 ***************************************************************************/
-   
+
 /*
-	Ñ§ºÅ£º201209054233
-	ÐÕÃû£ºÒ¹°ë¼Ó°à¿ñ
+        Student ID: 201209054233
+        Name: Ye Ban Jia Ban Kuang
 */
 
+#include "rowcol.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-#include  <stdio.h>
-#include  <stdlib.h>
-#include  "rowcol.h"
-#include  <math.h>
-
-/* ²Î¿¼µÄÁÐÇóºÍº¯ÊýÊµÏÖ */
-/* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÁÐµÄºÍ¡£Çë×¢Òâ¶ÔÓÚÐÐºÍÁÐÇóºÍÀ´Ëµ£¬µ÷ÓÃ²ÎÊýÊÇ
-	Ò»ÑùµÄ£¬Ö»ÊÇµÚ2¸ö²ÎÊý²»»áÓÃµ½¶øÒÑ
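+/* Reference implementation of the column-sum function. */
+/* Computes the sum of each column of the matrix. Note that the call parameters are the
+        same as for the row-and-column sum; the 2nd parameter is simply not used.
+*/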
 
-void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
-{
-    int i,j;
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) { /* writes colsum only; rowsum is not touched */
+  int i, j;
+  for (j = 0; j < N; j++) { /* one pass per column j */
+    colsum[j] = 0;
+    for (i = 0; i < N; i++) /* add M[i][j] for every row i */
+      colsum[j] += M[i][j];
+  }
+}
+
+/* Reference implementation of the row-and-column-sum function. */
+/* Computes the sum of every row and every column of the matrix. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
+  int i, j;
+  for (i = 0; i < N; i++) {
+    rowsum[i] = colsum[i] = 0;
     for (j = 0; j < N; j++) {
-	colsum[j] = 0;
-	for (i = 0; i < N; i++)
-	    colsum[j] += M[i][j];
+      rowsum[i] += M[i][j];
+      colsum[i] += M[j][i];
     }
+  }
 }
 
-
-/* ²Î¿¼µÄÁÐºÍÐÐÇóºÍº¯ÊýÊµÏÖ */
-/* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ¡£ */
-
-void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
-{
-    int i,j;
-    for (i = 0; i < N; i++) {
-	rowsum[i] = colsum[i] = 0;
-	for (j = 0; j < N; j++) {
-	    rowsum[i] += M[i][j];
-	    colsum[i] += M[j][i];
-	}
-    }
-}
-
-
-
-/* 
-	Õâ¸ö±í¸ñ°üº¬¶à¸öÊý×éÔªËØ£¬Ã¿Ò»×éÔªËØ£¨º¯ÊýÃû×Ö, COL/ROWCOL, "ÃèÊö×Ö·û´®"£©
-	COL±íÊ¾¸Ãº¯Êý½ö½ö¼ÆËãÃ¿Ò»ÁÐµÄºÍ
-	ROWCOL±íÊ¾¸Ãº¯Êý¼ÆËãÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ
-	½«ÄãÈÏÎª×îºÃµÄÁ½¸öÊµÏÖ£¬·ÅÔÚ×îÇ°Ãæ¡£
-	±ÈÈç£º
-	{my_c_sum1, "³¬¼¶À¬»øÁÐÇóºÍÊµÏÖ"},
-	{my_rc_sum2, "ºÃÒ»µãµÄÐÐÁÐÇóºÍÊµÏÖ"},
+/*
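+        This table holds several entries, each of the form (function name, COL/ROWCOL, "description string").
+        COL means the function computes only the sum of each column.
+        ROWCOL means the function computes the sum of each row and each column.
+        Put the two implementations you consider best at the very front.
+        For example:
+        {my_c_sum1, "Terrible column-sum implementation"},
+        {my_rc_sum2, "Slightly better row/column-sum implementation"},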
 */
 
-rc_fun_rec rc_fun_tab[] = 
-{
+rc_fun_rec rc_fun_tab[] = {
 
-  /* µÚÒ»Ïî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+    /* First entry: should be your best column-sum implementation */
     {c_sum, COL, "Best column sum"},
-  /* µÚ¶þÏî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÐÐÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+    /* Second entry: should be your best row-and-column-sum implementation */
     {rc_sum, ROWCOL, "Best row and column sum"},
 
     {c_sum, COL, "Column sum, reference implementation"},
 
     {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
 
- /* ÏÂÃæµÄ´úÂë²»ÄÜÐÞ¸Ä»òÕßÉ¾³ý£¡£¡±íÃ÷Êý×éÁÐ±í½áÊø */
-    {NULL,ROWCOL,NULL}
-};
+    /* The entry below must not be modified or deleted!! It marks the end of the array */
+    {NULL, ROWCOL, NULL}};
\ No newline at end of file
diff --git a/perflab/matrix/rowcol.c~ b/perflab/matrix/rowcol.c~
new file mode 100644
index 0000000..990ce83
--- /dev/null
+++ b/perflab/matrix/rowcol.c~
@@ -0,0 +1,162 @@
+/**************************************************************************
+	Row/column sum functions. Edit this file as follows:
+	1. Write your student ID and name below as a comment;
+	2. Implement different versions of the row/column sum functions;
+	3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+		(best row-and-column sum, best column sum) are its first two entries
+***************************************************************************/
+   
+/*
+	Ñ§ºÅ£º202302723005
+	ÐÕÃû£º³Ì¾°Óä
+*/
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  "rowcol.h"
+#include  <math.h>
+#include  <cuda_runtime.h>
+
+/* Reference implementation of the column-sum function. */
+/* Computes the sum of each column of the matrix. Note that the call parameters are the
+	same as for the row-and-column sum; the 2nd parameter is simply not used.
+*/
+
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (j = 0; j < N; j++) {
+	colsum[j] = 0;
+	for (i = 0; i < N; i++)
+	    colsum[j] += M[i][j];
+    }
+}
+
+
+/* Reference implementation of the row-and-column-sum function. */
+/* Computes the sum of every row and every column of the matrix. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (i = 0; i < N; i++) {
+	rowsum[i] = colsum[i] = 0;
+	for (j = 0; j < N; j++) {
+	    rowsum[i] += M[i][j];
+	    colsum[i] += M[j][i];
+	}
+    }
+}
+
+/* CUDA-optimized column-sum function */
+void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    // ·ÖÅäÉè±¸ÄÚ´æ
+    int *d_M, *d_colsum;
+    cudaMalloc(&d_M, N * N * sizeof(int));
+    cudaMalloc(&d_colsum, N * sizeof(int));
+    
+    // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+    cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+    
+    // ¶¨ÒåCUDAºËº¯Êý
+    dim3 blockDim(256);
+    dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+    
+    // Æô¶¯ºËº¯Êý
+    cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
+    
+    // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+    cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+    
+    // ÊÍ·ÅÉè±¸ÄÚ´æ
+    cudaFree(d_M);
+    cudaFree(d_colsum);
+}
+
+/* CUDA-optimized row-and-column-sum function */
+void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    // ·ÖÅäÉè±¸ÄÚ´æ
+    int *d_M, *d_rowsum, *d_colsum;
+    cudaMalloc(&d_M, N * N * sizeof(int));
+    cudaMalloc(&d_rowsum, N * sizeof(int));
+    cudaMalloc(&d_colsum, N * sizeof(int));
+    
+    // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+    cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+    
+    // ¶¨ÒåCUDAºËº¯Êý
+    dim3 blockDim(256);
+    dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+    
+    // Æô¶¯ºËº¯Êý
+    cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
+    
+    // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+    cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+    
+    // ÊÍ·ÅÉè±¸ÄÚ´æ
+    cudaFree(d_M);
+    cudaFree(d_rowsum);
+    cudaFree(d_colsum);
+}
+
+/* CUDA kernel - column sum */
+__global__ void cudaColumnSum(int *M, int *colsum)
+{
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    if (col < N) {
+        colsum[col] = 0;
+        for (int row = 0; row < N; row++) {
+            colsum[col] += M[row * N + col];
+        }
+    }
+}
+
+/* CUDA kernel - row and column sum */
+__global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < N) {
+        // ¼ÆËãÐÐºÍ
+        rowsum[idx] = 0;
+        for (int j = 0; j < N; j++) {
+            rowsum[idx] += M[idx * N + j];
+        }
+        
+        // ¼ÆËãÁÐºÍ
+        colsum[idx] = 0;
+        for (int i = 0; i < N; i++) {
+            colsum[idx] += M[i * N + idx];
+        }
+    }
+}
+
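+/*
+	This table holds several entries, each of the form (function name, COL/ROWCOL, "description string").
+	COL means the function computes only the sum of each column.
+	ROWCOL means the function computes the sum of each row and each column.
+	Put the two implementations you consider best at the very front.
+	For example:
+	{my_c_sum1, "Terrible column-sum implementation"},
+	{my_rc_sum2, "Slightly better row/column-sum implementation"},
+*/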
+
+rc_fun_rec rc_fun_tab[] = 
+{
+
+  /* µÚÒ»Ïî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+    {cuda_c_sum, COL, "CUDA optimized column sum"},
+  /* µÚ¶þÏî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÐÐÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+    {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
+
+    {c_sum, COL, "Column sum, reference implementation"},
+
+    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+
+ /* ÏÂÃæµÄ´úÂë²»ÄÜÐÞ¸Ä»òÕßÉ¾³ý£¡£¡±íÃ÷Êý×éÁÐ±í½áÊø */
+    {NULL,ROWCOL,NULL}
+};
diff --git a/perflab/matrix/rowcol.o b/perflab/matrix/rowcol.o
new file mode 100644
index 0000000..abada5f
Binary files /dev/null and b/perflab/matrix/rowcol.o differ
diff --git a/perflab/matrix/rowcol.y~ b/perflab/matrix/rowcol.y~
new file mode 100644
index 0000000..5d3310a
--- /dev/null
+++ b/perflab/matrix/rowcol.y~
@@ -0,0 +1,240 @@
+/**************************************************************************
+        Row/column sum functions. Edit this file as follows:
+        1. Write your student ID and name in the comment below;
+        2. Implement different versions of the row/column sum functions;
+        3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+                (best row-and-column sum, best column sum) are its first two entries.
+***************************************************************************/
+   
+/*
+        ¿¿¿¿201209054233
+        ¿¿¿¿¿¿¿¿¿¿¿¿¿¿
+*/
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  "rowcol.h"
+#include  <math.h>
+
+/* Reference implementation of the column sum. */
+/* Computes the sum of every column of M into colsum. Row and column sum
+        functions share one signature; rowsum (2nd parameter) is simply unused here.
+*/
+
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (j = 0; j < N; j++) {
+        colsum[j] = 0;
+        for (i = 0; i < N; i++)
+            colsum[j] += M[i][j];
+    }
+}
+
+
+/* Reference implementation of the row and column sum. */
+/* Computes the sum of every row (into rowsum) and every column (into colsum) of M. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (i = 0; i < N; i++) {
+        rowsum[i] = colsum[i] = 0;
+        for (j = 0; j < N; j++) {
+            rowsum[i] += M[i][j];
+            colsum[i] += M[j][i];
+        }
+    }
+}
+
+
+
+/* 
+        This table holds one entry per implementation: {function, COL/ROWCOL, "description string"}.
+        COL means the function computes only the sum of each column.
+        ROWCOL means the function computes the sum of each row and of each column.
+        Put the two implementations you consider best at the front.
+        For example:
+        {my_c_sum1, COL, "Really terrible column sum"},
+        {my_rc_sum2, ROWCOL, "Somewhat better row and column sum"},
+*/
+
+rc_fun_rec rc_fun_tab[] = 
+{
+
+  /* First entry: should be your best column-sum implementation */
+    {c_sum, COL, "Best column sum"},
+  /* Second entry: should be your best row-and-column-sum implementation */
+    {rc_sum, ROWCOL, "Best row and column sum"},
+
+    {c_sum, COL, "Column sum, reference implementation"},
+
+    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+
+ /* Do not modify or delete the entry below! It marks the end of the table */
+    {NULL,ROWCOL,NULL}
+};
+
+// /**************************************************************************
+// 	ÐÐ/ÁÐÇóºÍº¯Êý¡£°´ÏÂÃæµÄÒªÇó±à¼­´ËÎÄ¼þ£º
+// 	1. ½«ÄãµÄÑ§ºÅ¡¢ÐÕÃû£¬ÒÔ×¢ÊÍµÄ·½Ê½Ð´µ½ÏÂÃæ£»
+// 	2. ÊµÏÖ²»Í¬°æ±¾µÄÐÐÁÐÇóºÍº¯Êý£»
+// 	3. ±à¼­rc_fun_rec rc_fun_tabÊý×é£¬½«ÄãµÄ×îºÃµÄ´ð°¸
+// 		£¨×îºÃµÄÐÐºÍÁÐÇóºÍ¡¢×îºÃµÄÁÐÇóºÍ£©×÷ÎªÊý×éµÄÇ°Á½Ïî
+// ***************************************************************************/
+//
+// /*
+// 	Ñ§ºÅ£º202302723005
+// 	ÐÕÃû£º³Ì¾°Óä
+// */
+//
+//
+// #include  <stdio.h>
+// #include  <stdlib.h>
+// #include  "rowcol.h"
+// #include  <math.h>
+// #include  <cuda_runtime.h>
+//
+// /* ²Î¿¼µÄÁÐÇóºÍº¯ÊýÊµÏÖ */
+// /* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÁÐµÄºÍ¡£Çë×¢Òâ¶ÔÓÚÐÐºÍÁÐÇóºÍÀ´Ëµ£¬µ÷ÓÃ²ÎÊýÊÇ
+// 	Ò»ÑùµÄ£¬Ö»ÊÇµÚ2¸ö²ÎÊý²»»áÓÃµ½¶øÒÑ
+// */
+//
+// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     int i,j;
+//     for (j = 0; j < N; j++) {
+// 	colsum[j] = 0;
+// 	for (i = 0; i < N; i++)
+// 	    colsum[j] += M[i][j];
+//     }
+// }
+//
+//
+// /* ²Î¿¼µÄÁÐºÍÐÐÇóºÍº¯ÊýÊµÏÖ */
+// /* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ¡£ */
+//
+// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     int i,j;
+//     for (i = 0; i < N; i++) {
+// 	rowsum[i] = colsum[i] = 0;
+// 	for (j = 0; j < N; j++) {
+// 	    rowsum[i] += M[i][j];
+// 	    colsum[i] += M[j][i];
+// 	}
+//     }
+// }
+//
+// /* CUDAÓÅ»¯µÄÁÐÇóºÍº¯Êý */
+// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     // ·ÖÅäÉè±¸ÄÚ´æ
+//     int *d_M, *d_colsum;
+//     cudaMalloc(&d_M, N * N * sizeof(int));
+//     cudaMalloc(&d_colsum, N * sizeof(int));
+//
+//     // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+//     cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+//
+//     // ¶¨ÒåCUDAºËº¯Êý
+//     dim3 blockDim(256);
+//     dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+//
+//     // Æô¶¯ºËº¯Êý
+//     cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
+//
+//     // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+//     cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//
+//     // ÊÍ·ÅÉè±¸ÄÚ´æ
+//     cudaFree(d_M);
+//     cudaFree(d_colsum);
+// }
+//
+// /* CUDAÓÅ»¯µÄÐÐÁÐÇóºÍº¯Êý */
+// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     // ·ÖÅäÉè±¸ÄÚ´æ
+//     int *d_M, *d_rowsum, *d_colsum;
+//     cudaMalloc(&d_M, N * N * sizeof(int));
+//     cudaMalloc(&d_rowsum, N * sizeof(int));
+//     cudaMalloc(&d_colsum, N * sizeof(int));
+//
+//     // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+//     cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+//
+//     // ¶¨ÒåCUDAºËº¯Êý
+//     dim3 blockDim(256);
+//     dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+//
+//     // Æô¶¯ºËº¯Êý
+//     cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
+//
+//     // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+//     cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//     cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//
+//     // ÊÍ·ÅÉè±¸ÄÚ´æ
+//     cudaFree(d_M);
+//     cudaFree(d_rowsum);
+//     cudaFree(d_colsum);
+// }
+//
+// /* CUDAºËº¯Êý - ÁÐÇóºÍ */
+// __global__ void cudaColumnSum(int *M, int *colsum)
+// {
+//     int col = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (col < N) {
+//         colsum[col] = 0;
+//         for (int row = 0; row < N; row++) {
+//             colsum[col] += M[row * N + col];
+//         }
+//     }
+// }
+//
+// /* CUDAºËº¯Êý - ÐÐÁÐÇóºÍ */
+// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
+// {
+//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (idx < N) {
+//         // ¼ÆËãÐÐºÍ
+//         rowsum[idx] = 0;
+//         for (int j = 0; j < N; j++) {
+//             rowsum[idx] += M[idx * N + j];
+//         }
+//
+//         // ¼ÆËãÁÐºÍ
+//         colsum[idx] = 0;
+//         for (int i = 0; i < N; i++) {
+//             colsum[idx] += M[i * N + idx];
+//         }
+//     }
+// }
+//
+// /* 
+// 	Õâ¸ö±í¸ñ°üº¬¶à¸öÊý×éÔªËØ£¬Ã¿Ò»×éÔªËØ£¨º¯ÊýÃû×Ö, COL/ROWCOL, "ÃèÊö×Ö·û´®"£©
+// 	COL±íÊ¾¸Ãº¯Êý½ö½ö¼ÆËãÃ¿Ò»ÁÐµÄºÍ
+// 	ROWCOL±íÊ¾¸Ãº¯Êý¼ÆËãÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ
+// 	½«ÄãÈÏÎª×îºÃµÄÁ½¸öÊµÏÖ£¬·ÅÔÚ×îÇ°Ãæ¡£
+// 	±ÈÈç£º
+// 	{my_c_sum1, "³¬¼¶À¬»øÁÐÇóºÍÊµÏÖ"},
+// 	{my_rc_sum2, "ºÃÒ»µãµÄÐÐÁÐÇóºÍÊµÏÖ"},
+// */
+//
+// rc_fun_rec rc_fun_tab[] = 
+// {
+//
+//   /* µÚÒ»Ïî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+//     {cuda_c_sum, COL, "CUDA optimized column sum"},
+//   /* µÚ¶þÏî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÐÐÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+//     {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
+//
+//     {c_sum, COL, "Column sum, reference implementation"},
+//
+//     {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+//
+//  /* ÏÂÃæµÄ´úÂë²»ÄÜÐÞ¸Ä»òÕßÉ¾³ý£¡£¡±íÃ÷Êý×éÁÐ±í½áÊø */
+//     {NULL,ROWCOL,NULL}
+// };
diff --git a/perflab/matrix/rowcol.z~ b/perflab/matrix/rowcol.z~
new file mode 100644
index 0000000..5d3310a
--- /dev/null
+++ b/perflab/matrix/rowcol.z~
@@ -0,0 +1,240 @@
+/**************************************************************************
+        Row/column sum functions. Edit this file as follows:
+        1. Write your student ID and name in the comment below;
+        2. Implement different versions of the row/column sum functions;
+        3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+                (best row-and-column sum, best column sum) are its first two entries.
+***************************************************************************/
+   
+/*
+        ¿¿¿¿201209054233
+        ¿¿¿¿¿¿¿¿¿¿¿¿¿¿
+*/
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  "rowcol.h"
+#include  <math.h>
+
+/* Reference implementation of the column sum. */
+/* Computes the sum of every column of M into colsum. Row and column sum
+        functions share one signature; rowsum (2nd parameter) is simply unused here.
+*/
+
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (j = 0; j < N; j++) {
+        colsum[j] = 0;
+        for (i = 0; i < N; i++)
+            colsum[j] += M[i][j];
+    }
+}
+
+
+/* Reference implementation of the row and column sum. */
+/* Computes the sum of every row (into rowsum) and every column (into colsum) of M. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+{
+    int i,j;
+    for (i = 0; i < N; i++) {
+        rowsum[i] = colsum[i] = 0;
+        for (j = 0; j < N; j++) {
+            rowsum[i] += M[i][j];
+            colsum[i] += M[j][i];
+        }
+    }
+}
+
+
+
+/* 
+        This table holds one entry per implementation: {function, COL/ROWCOL, "description string"}.
+        COL means the function computes only the sum of each column.
+        ROWCOL means the function computes the sum of each row and of each column.
+        Put the two implementations you consider best at the front.
+        For example:
+        {my_c_sum1, COL, "Really terrible column sum"},
+        {my_rc_sum2, ROWCOL, "Somewhat better row and column sum"},
+*/
+
+rc_fun_rec rc_fun_tab[] = 
+{
+
+  /* First entry: should be your best column-sum implementation */
+    {c_sum, COL, "Best column sum"},
+  /* Second entry: should be your best row-and-column-sum implementation */
+    {rc_sum, ROWCOL, "Best row and column sum"},
+
+    {c_sum, COL, "Column sum, reference implementation"},
+
+    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+
+ /* Do not modify or delete the entry below! It marks the end of the table */
+    {NULL,ROWCOL,NULL}
+};
+
+// /**************************************************************************
+// 	ÐÐ/ÁÐÇóºÍº¯Êý¡£°´ÏÂÃæµÄÒªÇó±à¼­´ËÎÄ¼þ£º
+// 	1. ½«ÄãµÄÑ§ºÅ¡¢ÐÕÃû£¬ÒÔ×¢ÊÍµÄ·½Ê½Ð´µ½ÏÂÃæ£»
+// 	2. ÊµÏÖ²»Í¬°æ±¾µÄÐÐÁÐÇóºÍº¯Êý£»
+// 	3. ±à¼­rc_fun_rec rc_fun_tabÊý×é£¬½«ÄãµÄ×îºÃµÄ´ð°¸
+// 		£¨×îºÃµÄÐÐºÍÁÐÇóºÍ¡¢×îºÃµÄÁÐÇóºÍ£©×÷ÎªÊý×éµÄÇ°Á½Ïî
+// ***************************************************************************/
+//
+// /*
+// 	Ñ§ºÅ£º202302723005
+// 	ÐÕÃû£º³Ì¾°Óä
+// */
+//
+//
+// #include  <stdio.h>
+// #include  <stdlib.h>
+// #include  "rowcol.h"
+// #include  <math.h>
+// #include  <cuda_runtime.h>
+//
+// /* ²Î¿¼µÄÁÐÇóºÍº¯ÊýÊµÏÖ */
+// /* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÁÐµÄºÍ¡£Çë×¢Òâ¶ÔÓÚÐÐºÍÁÐÇóºÍÀ´Ëµ£¬µ÷ÓÃ²ÎÊýÊÇ
+// 	Ò»ÑùµÄ£¬Ö»ÊÇµÚ2¸ö²ÎÊý²»»áÓÃµ½¶øÒÑ
+// */
+//
+// void c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     int i,j;
+//     for (j = 0; j < N; j++) {
+// 	colsum[j] = 0;
+// 	for (i = 0; i < N; i++)
+// 	    colsum[j] += M[i][j];
+//     }
+// }
+//
+//
+// /* ²Î¿¼µÄÁÐºÍÐÐÇóºÍº¯ÊýÊµÏÖ */
+// /* ¼ÆËã¾ØÕóÖÐµÄÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ¡£ */
+//
+// void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     int i,j;
+//     for (i = 0; i < N; i++) {
+// 	rowsum[i] = colsum[i] = 0;
+// 	for (j = 0; j < N; j++) {
+// 	    rowsum[i] += M[i][j];
+// 	    colsum[i] += M[j][i];
+// 	}
+//     }
+// }
+//
+// /* CUDAÓÅ»¯µÄÁÐÇóºÍº¯Êý */
+// void cuda_c_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     // ·ÖÅäÉè±¸ÄÚ´æ
+//     int *d_M, *d_colsum;
+//     cudaMalloc(&d_M, N * N * sizeof(int));
+//     cudaMalloc(&d_colsum, N * sizeof(int));
+//
+//     // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+//     cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+//
+//     // ¶¨ÒåCUDAºËº¯Êý
+//     dim3 blockDim(256);
+//     dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+//
+//     // Æô¶¯ºËº¯Êý
+//     cudaColumnSum<<<gridDim, blockDim>>>(d_M, d_colsum);
+//
+//     // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+//     cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//
+//     // ÊÍ·ÅÉè±¸ÄÚ´æ
+//     cudaFree(d_M);
+//     cudaFree(d_colsum);
+// }
+//
+// /* CUDAÓÅ»¯µÄÐÐÁÐÇóºÍº¯Êý */
+// void cuda_rc_sum(matrix_t M, vector_t rowsum, vector_t colsum)
+// {
+//     // ·ÖÅäÉè±¸ÄÚ´æ
+//     int *d_M, *d_rowsum, *d_colsum;
+//     cudaMalloc(&d_M, N * N * sizeof(int));
+//     cudaMalloc(&d_rowsum, N * sizeof(int));
+//     cudaMalloc(&d_colsum, N * sizeof(int));
+//
+//     // ½«Êý¾Ý´ÓÖ÷»ú¸´ÖÆµ½Éè±¸
+//     cudaMemcpy(d_M, M, N * N * sizeof(int), cudaMemcpyHostToDevice);
+//
+//     // ¶¨ÒåCUDAºËº¯Êý
+//     dim3 blockDim(256);
+//     dim3 gridDim((N + blockDim.x - 1) / blockDim.x);
+//
+//     // Æô¶¯ºËº¯Êý
+//     cudaRowColSum<<<gridDim, blockDim>>>(d_M, d_rowsum, d_colsum);
+//
+//     // ½«½á¹û´ÓÉè±¸¸´ÖÆ»ØÖ÷»ú
+//     cudaMemcpy(rowsum, d_rowsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//     cudaMemcpy(colsum, d_colsum, N * sizeof(int), cudaMemcpyDeviceToHost);
+//
+//     // ÊÍ·ÅÉè±¸ÄÚ´æ
+//     cudaFree(d_M);
+//     cudaFree(d_rowsum);
+//     cudaFree(d_colsum);
+// }
+//
+// /* CUDAºËº¯Êý - ÁÐÇóºÍ */
+// __global__ void cudaColumnSum(int *M, int *colsum)
+// {
+//     int col = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (col < N) {
+//         colsum[col] = 0;
+//         for (int row = 0; row < N; row++) {
+//             colsum[col] += M[row * N + col];
+//         }
+//     }
+// }
+//
+// /* CUDAºËº¯Êý - ÐÐÁÐÇóºÍ */
+// __global__ void cudaRowColSum(int *M, int *rowsum, int *colsum)
+// {
+//     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+//     if (idx < N) {
+//         // ¼ÆËãÐÐºÍ
+//         rowsum[idx] = 0;
+//         for (int j = 0; j < N; j++) {
+//             rowsum[idx] += M[idx * N + j];
+//         }
+//
+//         // ¼ÆËãÁÐºÍ
+//         colsum[idx] = 0;
+//         for (int i = 0; i < N; i++) {
+//             colsum[idx] += M[i * N + idx];
+//         }
+//     }
+// }
+//
+// /* 
+// 	Õâ¸ö±í¸ñ°üº¬¶à¸öÊý×éÔªËØ£¬Ã¿Ò»×éÔªËØ£¨º¯ÊýÃû×Ö, COL/ROWCOL, "ÃèÊö×Ö·û´®"£©
+// 	COL±íÊ¾¸Ãº¯Êý½ö½ö¼ÆËãÃ¿Ò»ÁÐµÄºÍ
+// 	ROWCOL±íÊ¾¸Ãº¯Êý¼ÆËãÃ¿Ò»ÐÐ¡¢Ã¿Ò»ÁÐµÄºÍ
+// 	½«ÄãÈÏÎª×îºÃµÄÁ½¸öÊµÏÖ£¬·ÅÔÚ×îÇ°Ãæ¡£
+// 	±ÈÈç£º
+// 	{my_c_sum1, "³¬¼¶À¬»øÁÐÇóºÍÊµÏÖ"},
+// 	{my_rc_sum2, "ºÃÒ»µãµÄÐÐÁÐÇóºÍÊµÏÖ"},
+// */
+//
+// rc_fun_rec rc_fun_tab[] = 
+// {
+//
+//   /* µÚÒ»Ïî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+//     {cuda_c_sum, COL, "CUDA optimized column sum"},
+//   /* µÚ¶þÏî£¬Ó¦µ±ÊÇÄãÐ´µÄ×îºÃÐÐÁÐÇóºÍµÄº¯ÊýÊµÏÖ */
+//     {cuda_rc_sum, ROWCOL, "CUDA optimized row and column sum"},
+//
+//     {c_sum, COL, "Column sum, reference implementation"},
+//
+//     {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+//
+//  /* ÏÂÃæµÄ´úÂë²»ÄÜÐÞ¸Ä»òÕßÉ¾³ý£¡£¡±íÃ÷Êý×éÁÐ±í½áÊø */
+//     {NULL,ROWCOL,NULL}
+// };
diff --git a/perflab/matrix/rowcol_202302723005.c b/perflab/matrix/rowcol_202302723005.c
new file mode 100644
index 0000000..b504582
--- /dev/null
+++ b/perflab/matrix/rowcol_202302723005.c
@@ -0,0 +1,69 @@
+/**************************************************************************
+        Row/column sum functions. Edit this file as follows:
+        1. Write your student ID and name in the comment below;
+        2. Implement different versions of the row/column sum functions;
+        3. Edit the rc_fun_rec rc_fun_tab array so that your best answers
+                (best row-and-column sum, best column sum) are its first two entries.
+***************************************************************************/
+
+/*
+        ????201209054233
+        ??????????????
+*/
+
+#include "rowcol.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Reference implementation of the column sum. */
+/* Computes the sum of every column of M into colsum. Row and column sum
+        functions share one signature; rowsum (2nd parameter) is simply unused here.
+*/
+
+void c_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
+  int i, j;
+  for (j = 0; j < N; j++) {
+    colsum[j] = 0;
+    for (i = 0; i < N; i++)
+      colsum[j] += M[i][j];
+  }
+}
+
+/* Reference implementation of the row and column sum. */
+/* Computes the sum of every row (into rowsum) and every column (into colsum) of M. */
+
+void rc_sum(matrix_t M, vector_t rowsum, vector_t colsum) {
+  int i, j;
+  for (i = 0; i < N; i++) {
+    rowsum[i] = colsum[i] = 0;
+    for (j = 0; j < N; j++) {
+      rowsum[i] += M[i][j];
+      colsum[i] += M[j][i];
+    }
+  }
+}
+
+/*
+        This table holds one entry per implementation: {function, COL/ROWCOL, "description string"}.
+        COL means the function computes only the sum of each column.
+        ROWCOL means the function computes the sum of each row and of each column.
+        Put the two implementations you consider best at the front.
+        For example:
+        {my_c_sum1, COL, "Really terrible column sum"},
+        {my_rc_sum2, ROWCOL, "Somewhat better row and column sum"},
+*/
+
+rc_fun_rec rc_fun_tab[] = {
+
+    /* First entry: should be your best column-sum implementation */
+    {c_sum, COL, "Best column sum"},
+    /* Second entry: should be your best row-and-column-sum implementation */
+    {rc_sum, ROWCOL, "Best row and column sum"},
+
+    {c_sum, COL, "Column sum, reference implementation"},
+
+    {rc_sum, ROWCOL, "Row and column sum, reference implementation"},
+
+    /* Do not modify or delete the entry below! It marks the end of the table */
+    {NULL, ROWCOL, NULL}};
\ No newline at end of file
diff --git a/perflab/matrix/rowcol_202302723005.o b/perflab/matrix/rowcol_202302723005.o
new file mode 100644
index 0000000..b28983c
Binary files /dev/null and b/perflab/matrix/rowcol_202302723005.o differ
diff --git a/perflab/matrix/rowcol_test.c b/perflab/matrix/rowcol_test.c
index 6b67926..e6a046e 100644
--- a/perflab/matrix/rowcol_test.c
+++ b/perflab/matrix/rowcol_test.c
@@ -1,9 +1,9 @@
 #include <stdio.h>
 #include <stdlib.h>
-//#include <random.h>
-#include "rowcol.h"
-#include "fcyc.h"
+// #include <random.h>
 #include "clock.h"
+#include "fcyc.h"
+#include "rowcol.h"
 
 #define MAX_ITER_COUNT 100
 
@@ -11,9 +11,9 @@
 static struct {
   double cref;  /* Cycles taken by reference solution */
   double cbest; /* Cycles taken by our best implementation */
-} cstandard[2] = 
-{{7.7, 6.40}, /* Column Sum */
- {9.75, 6.60} /* Row & Column Sum */
+} cstandard[2] = {
+    {7.7, 6.40}, /* Column Sum */
+    {9.75, 6.60} /* Row & Column Sum */
 };
 
 /* Put in code to align matrix so that it starts on a cache block boundary.
@@ -26,7 +26,7 @@ static struct {
 #define WPB 16
 
 int verbose = 1;
-int data[N*N+WPB];
+int data[N * N + WPB];
 int *mstart;
 
 typedef vector_t *row_t;
@@ -37,137 +37,122 @@ vector_t rsref, csref, rcomp, ccomp;
 static void init_tests(void);
 extern void make_CPU_busy(void);
 
-static void init_tests(void)
-{
-    int i, j;
-    size_t bytes_per_block = sizeof(int) * WPB;
-    /* round mstart up to nearest block boundary */
-    mstart = (int *)
-      (((size_t) data + bytes_per_block-1) / bytes_per_block * bytes_per_block);
-    for (i = 0; i < N; i++) {
-	rsref[i] = csref[i] = 0;
-    }
-    for (i = 0; i < N; i++) {
-	for (j = 0; j < N; j++) {
-	    int val = rand();
-	    mstart[i*N+j] = val;
-	    rsref[i] += val;
-	    csref[j] += val;
-	}
+static void init_tests(void) {
+  int i, j;
+  size_t bytes_per_block = sizeof(int) * WPB;
+  /* round mstart up to nearest block boundary */
+  mstart = (int *)(((size_t)data + bytes_per_block - 1) / bytes_per_block *
+                   bytes_per_block);
+  for (i = 0; i < N; i++) {
+    rsref[i] = csref[i] = 0;
+  }
+  for (i = 0; i < N; i++) {
+    for (j = 0; j < N; j++) {
+      int val = rand();
+      mstart[i * N + j] = val;
+      rsref[i] += val;
+      csref[j] += val;
     }
+  }
 }
 
-
 /* Test function on all values */
 int test_rc(rc_fun f, FILE *rpt, rc_comp_t rc_type) {
-    int i;
-    int ok = 1;
+  int i;
+  int ok = 1;
 
-    for (i = 0; i < N; i++)
-	rcomp[i] = ccomp[i] = 0xDEADBEEF;
-    f((row_t)mstart, rcomp, ccomp);
-
-    for (i = 0; ok && i < N; i++) {
-	if (rc_type == ROWCOL
-	    && rsref[i] != rcomp[i]) {
-	    ok = 0;
-	    if (rpt)
-		fprintf(rpt,
-			"¶ÔµÚ%dÐÐµÄ¼ÆËã³ö´í£¡ÕýÈ·½á¹ûÊÇ%d£¬µ«ÊÇ¼ÆËãµÃµ½%d\n",
-			i, rsref[i], rcomp[i]);
-	}
-	if ((rc_type == ROWCOL || rc_type == COL)
-		 && csref[i] != ccomp[i]) {
-	    ok = 0;
-	    if (rpt)
-		fprintf(rpt,
-			"¶ÔµÚ%dÁÐµÄ¼ÆËã³ö´í£¡ÕýÈ·½á¹ûÊÇ%d£¬µ«ÊÇ¼ÆËãµÃµ½%d\n",
-			i, csref[i], ccomp[i]);
-	}
+  for (i = 0; i < N; i++)
+    rcomp[i] = ccomp[i] = 0xDEADBEEF;
+  f((row_t)mstart, rcomp, ccomp);
 
+  for (i = 0; ok && i < N; i++) {
+    if (rc_type == ROWCOL && rsref[i] != rcomp[i]) {
+      ok = 0;
+      if (rpt)
+        fprintf(rpt, "å¯¹ç¬¬%dè¡Œçš„è®¡ç®—å‡ºé”™ï¼æ­£ç¡®ç»“æžœæ˜¯%dï¼Œä½†æ˜¯è®¡ç®—å¾—åˆ°%d\n", i,
+                rsref[i], rcomp[i]);
     }
-    return ok;
+    if ((rc_type == ROWCOL || rc_type == COL) && csref[i] != ccomp[i]) {
+      ok = 0;
+      if (rpt)
+        fprintf(rpt, "å¯¹ç¬¬%dåˆ—çš„è®¡ç®—å‡ºé”™ï¼æ­£ç¡®ç»“æžœæ˜¯%dï¼Œä½†æ˜¯è®¡ç®—å¾—åˆ°%d\n", i,
+                csref[i], ccomp[i]);
+    }
+  }
+  return ok;
 }
 
 /* Kludgy way to interface to cycle measuring code */
-void do_test(int *intf)
-{
-  rc_fun f = (rc_fun) intf;
+void do_test(long *intf) { /* long * matches the void (*)(long *) type time_rc passes to fcyc */
+  rc_fun f = (rc_fun)intf; /* recover the function under test smuggled through the data pointer */
   f((row_t)mstart, rcomp, ccomp);
 }
 
-void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp)
-{
-	int i;
-  int *intf = (int *) f;
+void time_rc(rc_fun f, rc_comp_t rc_type, char *descr, double *cycp) {
+  int i;
+  int *intf = (int *)f;
   double t, cme;
   t = 0;
-  if (verbose) printf("º¯Êý£º%s\n", descr);
+  if (verbose)
+    printf("å‡½æ•°ï¼š%s\n", descr);
   if (test_rc(f, stdout, rc_type)) {
-  	make_CPU_busy();
-  	for (i=0;i<MAX_ITER_COUNT;i++)
-    	t += fcyc(do_test, intf);
-    t = t/MAX_ITER_COUNT;
-    cme = t/(N*N);
-    if (verbose) printf("  ×ÜÖÜÆÚÊý = %.2f, Æ½¾ùÖÜÆÚ/ÔªËØ = %.2f\n",
-	   t, cme);
+    make_CPU_busy();
+    for (i = 0; i < MAX_ITER_COUNT; i++)
+      t += fcyc((void (*)(long *))do_test, intf);
+    t = t / MAX_ITER_COUNT;
+    cme = t / (N * N);
+    if (verbose)
+      printf("  æ€»å‘¨æœŸæ•° = %.2f, å¹³å‡å‘¨æœŸ/å…ƒç´  = %.2f\n", t, cme);
     if (cycp)
       *cycp = cme;
   }
 }
 
 /* Compute the grade achieved by function */
-static double compute_score(double cmeas, double cref, double cbest)
-{
-  double sbest = cref/cbest;
-  double smeas = cref/cmeas;
-  if (smeas < 0.1*(sbest-1)+1)
+static double compute_score(double cmeas, double cref, double cbest) {
+  double sbest = cref / cbest; /* speedup of the best known solution over the reference */
+  double smeas = cref / cmeas; /* speedup of the measured solution over the reference */
+  if (smeas < 0.1 * (sbest - 1) + 1) /* under 10% of the best gain (for sbest > 1): score 0 */
     return 0;
-  if (smeas > 1.1*(sbest-1)+1)
+  if (smeas > 1.1 * (sbest - 1) + 1) /* over 110% of the best gain (for sbest > 1): capped at 120 */
     return 120;
-  return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
+  return 100 * ((smeas - 1.0) / (sbest - 1.0) + 0.1); /* linear in between; smeas == sbest gives 110 */
 }
 
-int main(int argc, char *argv[])
-{
+int main(int argc, char *argv[]) {
   int i;
   double cme;
-  double cme_c,cme_rc;
-  int EnableScore=0;
-  
-  if (argc == 3)
-  {
-  	EnableScore = 1;
-  	verbose = 0;
+  double cme_c, cme_rc;
+  int EnableScore = 0;
+
+  if (argc == 3) {
+    EnableScore = 1;
+    verbose = 0;
   }
   init_tests();
-  set_fcyc_clear_cache(1);  /* Set so that clears cache between runs */
+  set_fcyc_clear_cache(1); /* Set so that clears cache between runs */
   for (i = 0; rc_fun_tab[i].f != NULL; i++) {
-      cme = 100.0;
-      time_rc(rc_fun_tab[i].f,
-	    rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
-    if (i == 0)
-    {
-    	cme_c = cme;
-    	if (EnableScore==0)
-    	{
-      printf("  ×î¸ß\"ÁÐÇóºÍ\"µÃ·Ö   ======================== %.0f\n",
-	     compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
-	    }
-	  }
-    if (i == 1)
-    {
-    	cme_rc = cme;
-    	if (EnableScore==0)
-    	{
-      printf("  ×î¸ß\"ÐÐºÍÁÐÇóºÍ\"µÃ·Ö ====================== %.0f\n",
-	     compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
-	    }
-	  }
+    cme = 100.0;
+    time_rc(rc_fun_tab[i].f, rc_fun_tab[i].rc_type, rc_fun_tab[i].descr, &cme);
+    if (i == 0) {
+      cme_c = cme;
+      if (EnableScore == 0) {
+        printf("  æœ€é«˜\"åˆ—æ±‚å’Œ\"å¾—åˆ†   ======================== %.0f\n",
+               compute_score(cme, cstandard[0].cref, cstandard[0].cbest));
+      }
+    }
+    if (i == 1) {
+      cme_rc = cme;
+      if (EnableScore == 0) {
+        printf("  æœ€é«˜\"è¡Œå’Œåˆ—æ±‚å’Œ\"å¾—åˆ† ====================== %.0f\n",
+               compute_score(cme, cstandard[1].cref, cstandard[1].cbest));
+      }
+    }
   }
-  
+
   if (EnableScore)
-  	printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n",cme_c,compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest),
-  	cme_rc,compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
+    printf("%.2f\t %.0f\t %.2f\t %.0f\t 0\t 0\n", cme_c,
+           compute_score(cme_c, cstandard[0].cref, cstandard[0].cbest), cme_rc,
+           compute_score(cme_rc, cstandard[1].cref, cstandard[1].cbest));
   return 0;
 }
diff --git a/perflab/matrix/rowcol_test.o b/perflab/matrix/rowcol_test.o
new file mode 100644
index 0000000..d214ba4
Binary files /dev/null and b/perflab/matrix/rowcol_test.o differ
diff --git a/perflab/poly/Makefile b/perflab/poly/Makefile
new file mode 100644
index 0000000..9f55dad
--- /dev/null
+++ b/perflab/poly/Makefile
@@ -0,0 +1,35 @@
+CC = gcc
+NVCC = nvcc
+CFLAGS = -Wall -O2 -g
+CUDA_FLAGS = -O2 -g
+LDFLAGS = -lm -lcudart
+
+# Sources: SRCS are compiled with $(CC); CUDA_SRCS is not referenced below (poly.o is added to OBJS by hand)
+SRCS = poly_test.c clock.c cpe.c fcyc.c lsquare.c
+CUDA_SRCS = poly.cu
+OBJS = $(SRCS:.c=.o) poly.o
+
+# Name of the test executable
+TARGET = poly_test
+
+# Default target: build the test executable
+all: $(TARGET)
+
+# Link all objects with $(CC); the libraries in LDFLAGS are placed after the objects
+$(TARGET): $(OBJS)
+	$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
+
+# Pattern rule: compile each C source into an object file
+%.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# CUDA object: poly.o is compiled from poly.cu with $(NVCC)
+poly.o: poly.cu
+	$(NVCC) $(CUDA_FLAGS) -c $< -o $@
+
+# Remove the object files and the executable
+clean:
+	rm -f $(OBJS) $(TARGET)
+
+# all and clean do not name files
+.PHONY: all clean 
\ No newline at end of file
diff --git a/perflab/poly/clock.c b/perflab/poly/clock.c
index a587590..159ba4e 100644
--- a/perflab/poly/clock.c
+++ b/perflab/poly/clock.c
@@ -13,11 +13,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <intrin.h>
-//#include <intrinsics.h>
-#include <windows.h>
-#include <time.h>
+#include <x86intrin.h>
+// #include <intrinsics.h>
 #include "clock.h"
+#include <time.h>
+#include <windows.h>
 
 /* Use x86 cycle counter */
 
@@ -27,203 +27,195 @@ static unsigned cyc_lo = 0;
 
 /* Set *hi and *lo to the high and low order bits  of the cycle counter.
    Implementation requires assembly code to use the rdtsc instruction. */
-void access_counter(unsigned *hi, unsigned *lo)
-{
+void access_counter(unsigned *hi, unsigned *lo) {
 
-	long long counter;
+  long long counter;
 
-	counter = __rdtsc();
-	(*hi) = (unsigned int)(counter >> 32);
-	(*lo) = (unsigned int)counter;
-/*
+  counter = __rdtsc();
+  (*hi) = (unsigned int)(counter >> 32);
+  (*lo) = (unsigned int)counter;
+  /*
 
-	LARGE_INTEGER lPerformanceCount;
+          LARGE_INTEGER lPerformanceCount;
 
-	QueryPerformanceCounter(&lPerformanceCount);
-	(*hi) = (unsigned int)lPerformanceCount.HighPart;
-	(*lo) = (unsigned int)lPerformanceCount.LowPart;
-//	printf("%08X %08X\n",(*hi),(*lo));
-*/
+          QueryPerformanceCounter(&lPerformanceCount);
+          (*hi) = (unsigned int)lPerformanceCount.HighPart;
+          (*lo) = (unsigned int)lPerformanceCount.LowPart;
+  //	printf("%08X %08X\n",(*hi),(*lo));
+  */
 }
 
-
 /* Record the current value of the cycle counter. */
-void start_counter()
-{
-    access_counter(&cyc_hi, &cyc_lo);
-}
+void start_counter() { access_counter(&cyc_hi, &cyc_lo); }
 
 /* Return the number of cycles since the last call to start_counter. */
-double get_counter()
-{
-    unsigned ncyc_hi, ncyc_lo;
-    unsigned hi, lo, borrow;
-    double result;
+double get_counter() {
+  unsigned ncyc_hi, ncyc_lo;
+  unsigned hi, lo, borrow;
+  double result;
 
-    /* Get cycle counter */
-    access_counter(&ncyc_hi, &ncyc_lo);
+  /* Get cycle counter */
+  access_counter(&ncyc_hi, &ncyc_lo);
 
-    /* Do double precision subtraction */
-    lo = ncyc_lo - cyc_lo;
-    borrow = cyc_lo > ncyc_lo;
-    hi = ncyc_hi - cyc_hi - borrow;
-    result = (double) hi * (1 << 30) * 4 + lo;
-    return result;
+  /* Do double precision subtraction */
+  lo = ncyc_lo - cyc_lo;
+  borrow = cyc_lo > ncyc_lo;
+  hi = ncyc_hi - cyc_hi - borrow;
+  result = (double)hi * (1 << 30) * 4 + lo;
+  return result;
 }
-void make_CPU_busy(void)
-{
-	volatile double old_tick,new_tick;
-	start_counter();
-	old_tick = get_counter();
-	new_tick = get_counter();
-	while (new_tick - old_tick < 1000000000)
-		new_tick = get_counter();
+void make_CPU_busy(void) {
+  volatile double old_tick, new_tick;
+  start_counter();
+  old_tick = get_counter();
+  new_tick = get_counter();
+  while (new_tick - old_tick < 1000000000)
+    new_tick = get_counter();
 }
 
-//CPUµÄÆµÂÊ
-double mhz(int verbose)
-{
-    LARGE_INTEGER lFrequency;
-    LARGE_INTEGER lPerformanceCount_Start;
-    LARGE_INTEGER lPerformanceCount_End;
-	double mhz;
-	double fTime;
-	__int64 _i64StartCpuCounter;
-	__int64 _i64EndCpuCounter;
-    //On a multiprocessor machine, it should not matter which processor is called.
-    //However, you can get different results on different processors due to bugs in
-    //the BIOS or the HAL. To specify processor affinity for a thread, use the SetThreadAffinityMask function.
-    HANDLE hThread=GetCurrentThread();
-    SetThreadAffinityMask(hThread,0x1);
+// CPU frequency
+double mhz(int verbose) {
+  LARGE_INTEGER lFrequency;
+  LARGE_INTEGER lPerformanceCount_Start;
+  LARGE_INTEGER lPerformanceCount_End;
+  double mhz;
+  double fTime;
+  __int64 _i64StartCpuCounter;
+  __int64 _i64EndCpuCounter;
+  // On a multiprocessor machine, it should not matter which processor is
+  // called. However, you can get different results on different processors due
+  // to bugs in the BIOS or the HAL. To specify processor affinity for a thread,
+  // use the SetThreadAffinityMask function.
+  HANDLE hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1);
 
-    //Ö÷°åÉÏ¸ß¾«¶È¶¨Ê±Æ÷µÄ¾§ÕñÆµÂÊ
-    //Õâ¸ö¶¨Ê±Æ÷Ó¦¸Ã¾ÍÊÇÒ»Æ¬8253»òÕß8254
-    //ÔÚintel ich7ÖÐ¼¯³ÉÁË8254
-    QueryPerformanceFrequency(&lFrequency);
-//    if (verbose>0)
-//    	printf("¸ß¾«¶È¶¨Ê±Æ÷µÄ¾§ÕñÆµÂÊ£º%1.0fHz.\n",(double)lFrequency.QuadPart);
+  // Crystal frequency of the high-resolution timer on the motherboard.
+  // This timer is presumably an 8253 or 8254 chip;
+  // the Intel ICH7 integrates an 8254.
+  QueryPerformanceFrequency(&lFrequency);
+  //    if (verbose>0)
+  //    	printf("High-resolution timer crystal frequency: %1.0fHz.\n",(double)lFrequency.QuadPart);
 
-    //Õâ¸ö¶¨Ê±Æ÷Ã¿¾­¹ýÒ»¸öÊ±ÖÓÖÜÆÚ£¬Æä¼ÆÊýÆ÷»á+1
-    QueryPerformanceCounter(&lPerformanceCount_Start);
+  // The timer's counter is incremented by 1 on each of its clock ticks.
+  QueryPerformanceCounter(&lPerformanceCount_Start);
 
-    //RDTSCÖ¸Áî:»ñÈ¡CPU¾­ÀúµÄÊ±ÖÓÖÜÆÚÊý
-    _i64StartCpuCounter=__rdtsc();
+  // RDTSC instruction: read the number of clock cycles the CPU has gone through.
+  _i64StartCpuCounter = __rdtsc();
 
-    //ÑÓÊ±³¤Ò»µã,Îó²î»áÐ¡Ò»µã
-    //int nTemp=100000;
-    //while (--nTemp);
-    Sleep(200);
+  // A longer delay gives a smaller measurement error.
+  // int nTemp=100000;
+  // while (--nTemp);
+  Sleep(200);
 
-    QueryPerformanceCounter(&lPerformanceCount_End);
+  QueryPerformanceCounter(&lPerformanceCount_End);
 
-    _i64EndCpuCounter=__rdtsc();
+  _i64EndCpuCounter = __rdtsc();
 
-    //f=1/T => f=¼ÆÊý´ÎÊý/(¼ÆÊý´ÎÊý*T)
-    //ÕâÀïµÄ¡°¼ÆÊý´ÎÊý*T¡±¾ÍÊÇÊ±¼ä²î
-    fTime=((double)lPerformanceCount_End.QuadPart-(double)lPerformanceCount_Start.QuadPart)
-        /(double)lFrequency.QuadPart;
+  // f = 1/T  =>  f = count / (count * T)
+  // Here "count * T" is the elapsed time.
+  fTime = ((double)lPerformanceCount_End.QuadPart -
+           (double)lPerformanceCount_Start.QuadPart) /
+          (double)lFrequency.QuadPart;
 
- 		mhz = (_i64EndCpuCounter-_i64StartCpuCounter)/(fTime*1000000.0);
-    if (verbose>0)
-    	printf("CPUÆµÂÊÎª:%1.6fMHz.\n",mhz);
-    return mhz;
+  mhz = (_i64EndCpuCounter - _i64StartCpuCounter) / (fTime * 1000000.0);
+  if (verbose > 0)
+    printf("CPUÆµï¿½ï¿½Îª:%1.6fMHz.\n", mhz);
+  return mhz;
 }
 
-double CPU_Factor1(void)
-{
-	double result;
-	int i,j,k,ii,jj,kk;
-	LARGE_INTEGER lStart,lEnd;
+double CPU_Factor1(void) { /* Times a fixed busy loop with get_counter() and the performance counter; prints both, returns the former. */
+  double result; /* value of get_counter() after the loop; this is what is returned */
+  int i, j, k, ii, jj, kk;
+  LARGE_INTEGER lStart, lEnd; /* performance-counter readings taken around the loop */
   LARGE_INTEGER lFrequency;
   HANDLE hThread;
   double fTime;
 
   QueryPerformanceFrequency(&lFrequency);
 
-	ii = 43273;
-	kk = 1238;
-	result = 1;
-	jj = 1244;
+  ii = 43273;
+  kk = 1238;
+  result = 1; /* placeholder; overwritten by get_counter() below */
+  jj = 1244;
 
-    hThread=GetCurrentThread();
-    SetThreadAffinityMask(hThread,0x1);
+  hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1); /* mask 0x1: restrict this thread to logical processor 0 */
   QueryPerformanceCounter(&lStart);
   //_asm("cpuid");
-	start_counter();
-	for (i=0;i<100;i++)
-		for (j=0;j<1000;j++)
-			for (k=0;k<1000;k++)
-				kk += kk*ii+jj;
+  start_counter(); /* defined elsewhere in this file; presumably starts the cycle measurement - confirm */
+  for (i = 0; i < 100; i++) /* 100 * 1000 * 1000 inner iterations in total */
+    for (j = 0; j < 1000; j++)
+      for (k = 0; k < 1000; k++)
+        kk += kk * ii + jj; /* busy work only: kk is not read after the loops */
 
-	result = get_counter();
-	QueryPerformanceCounter(&lEnd);
-  fTime=((double)lEnd.QuadPart-(double)lStart.QuadPart);
-	printf("CPUÔËÐÐÊ±¼äÎª%f",result);
-	printf("\t %f\n",fTime);
-	return result;
+  result = get_counter();
+  QueryPerformanceCounter(&lEnd);
+  fTime = ((double)lEnd.QuadPart - (double)lStart.QuadPart); /* raw tick difference; lFrequency is never applied to it */
+  printf("CPUï¿½ï¿½ï¿½ï¿½Ê±ï¿½ï¿½Îª%f", result);
+  printf("\t %f\n", fTime);
+  return result;
 }
 
-double CPU_Factor(void)
-{
- double frequency;
- double multiplier = 1000 * 1000 * 1000;//nano
- LARGE_INTEGER lFrequency;
- LARGE_INTEGER start,stop;
- HANDLE hThread;
- int i;
- const int gigahertz= 1000*1000*1000;
- const int known_instructions_per_loop = 27317; 
+double CPU_Factor(void) { /* Times a fixed counting loop with the performance counter, prints derived figures, returns ratio. */
+  double frequency; /* lFrequency.QuadPart as a double */
+  double multiplier = 1000 * 1000 * 1000; // nano
+  LARGE_INTEGER lFrequency;
+  LARGE_INTEGER start, stop; /* performance-counter readings around the loop */
+  HANDLE hThread;
+  int i;
+  const int gigahertz = 1000 * 1000 * 1000; /* not referenced anywhere in this function */
+  const int known_instructions_per_loop = 27317; /* divisor for ratio; origin of the value is not shown here */
 
- int iterations = 100000000;
- int g = 0;
- double normal_ticks_per_second;
-double ticks;
-double time;
-double loops_per_sec;
-double instructions_per_loop;
-double ratio;
-double actual_freq;
+  int iterations = 100000000;
+  int g = 0; /* incremented in the timed loop, never read afterwards */
+  double normal_ticks_per_second;
+  double ticks; /* stop - start, in performance-counter ticks */
+  double time; /* elapsed loop time in nanoseconds */
+  double loops_per_sec;
+  double instructions_per_loop;
+  double ratio; /* the value this function returns */
+  double actual_freq;
 
- QueryPerformanceFrequency(&lFrequency);
- frequency = (double)lFrequency.QuadPart;
+  QueryPerformanceFrequency(&lFrequency);
+  frequency = (double)lFrequency.QuadPart;
 
- hThread=GetCurrentThread();
- SetThreadAffinityMask(hThread,0x1);
- QueryPerformanceCounter(&start);
- for( i = 0; i < iterations; i++)
- {
-   g++;
-   g++;
-   g++;
-   g++;
- }
- QueryPerformanceCounter(&stop);
+  hThread = GetCurrentThread();
+  SetThreadAffinityMask(hThread, 0x1); /* mask 0x1: restrict this thread to logical processor 0 */
+  QueryPerformanceCounter(&start);
+  for (i = 0; i < iterations; i++) { /* timed region: four increments of g per pass */
+    g++;
+    g++;
+    g++;
+    g++;
+  }
+  QueryPerformanceCounter(&stop);
 
- //normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ 3199
- normal_ticks_per_second = frequency * 1000;
- ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
- time = (ticks * multiplier) /frequency;
- loops_per_sec = iterations / (time/multiplier);
- instructions_per_loop = normal_ticks_per_second  / loops_per_sec;
+  // normal ticks differs from the WMI data, i.e 3125, when WMI 3201, and CPUZ
+  // 3199
+  normal_ticks_per_second = frequency * 1000; /* counter frequency scaled by 1000 */
+  ticks = (double)((double)stop.QuadPart - (double)start.QuadPart);
+  time = (ticks * multiplier) / frequency; /* ticks / frequency is seconds; times 1e9 gives nanoseconds */
+  loops_per_sec = iterations / (time / multiplier); /* time / multiplier converts back to seconds */
+  instructions_per_loop = normal_ticks_per_second / loops_per_sec;
 
- ratio = (instructions_per_loop / known_instructions_per_loop);
- actual_freq = normal_ticks_per_second / ratio;
-/* 
- actual_freq = normal_ticks_per_second / ratio;
- actual_freq = known_instructions_per_loop*iterations*multiplier/time;
+  ratio = (instructions_per_loop / known_instructions_per_loop);
+  actual_freq = normal_ticks_per_second / ratio;
+  /*
+   actual_freq = normal_ticks_per_second / ratio;
+   actual_freq = known_instructions_per_loop*iterations*multiplier/time;
 
-	2293 = x/time;
-	
-	2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
- loops_per_sec = iterations*frequency / ticks
- 
- instructions_per_loop =   / loops_per_sec;
-*/ 
- printf("Perf counter freq: %f\n", normal_ticks_per_second);
- printf("Loops per sec:      %f\n", loops_per_sec);
- printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
- printf("Presumed freq: %f\n", actual_freq);
- printf("ratio: %f\n", ratio);
- printf("time=%f\n",time);
- return ratio;
+          2293 = x/time;
+
+          2292.599713*1191533038.809362=known_instructions_per_loop*100000000*1000
+   loops_per_sec = iterations*frequency / ticks
+
+   instructions_per_loop =   / loops_per_sec;
+  */
+  printf("Perf counter freq: %f\n", normal_ticks_per_second);
+  printf("Loops per sec:      %f\n", loops_per_sec);
+  printf("Perf counter freq div loops per sec: %f\n", instructions_per_loop);
+  printf("Presumed freq: %f\n", actual_freq);
+  printf("ratio: %f\n", ratio);
+  printf("time=%f\n", time);
+  return ratio; /* instructions_per_loop / known_instructions_per_loop */
 }
diff --git a/perflab/poly/poly.cu b/perflab/poly/poly.cu
new file mode 100644
index 0000000..73347fe
--- /dev/null
+++ b/perflab/poly/poly.cu
@@ -0,0 +1,325 @@
+/**************************************************************************
+	Polynomial evaluation functions. Edit this file as follows:
+	1. Write your student ID and name below, as a comment;
+	2. Implement different versions of the polynomial evaluation function;
+	3. Edit the peval_fun_rec peval_fun_tab array so that your best answers
+		(lowest CPE, lowest C10) are its first two entries
+***************************************************************************/
+   
+/*
+	Student ID: 201209054233
+	Name: 夜半加班狂
+*/
+
+
+
+#include  <stdio.h>
+#include  <stdlib.h>
+#include  <cuda_runtime.h>
+typedef int (*peval_fun)(int*, int, int); /* evaluator signature; the evaluators in this file take (coefficients, degree, x) */
+
+typedef struct {
+  peval_fun f;   /* evaluator to run; NULL in the entry that terminates peval_fun_tab */
+  char *descr;   /* description string stored next to the evaluator */
+} peval_fun_rec, *peval_fun_ptr;
+
+
+/**************************************************************************
+ Edit this comment to indicate your name and Andrew ID
+#ifdef ASSIGN
+   Submission by Harry Q. Bovik, bovik@andrew.cmu.edu
+#else
+   Instructor's version.
+   Created by Randal E. Bryant, Randy.Bryant@cs.cmu.edu, 10/07/02
+#endif
+***************************************************************************/
+
+/*
+ * const_poly_eval - evaluate the fixed cubic 21 + 90*x + 42*x^2 + 88*x^3.
+ * (Lab note: run the test program once to learn which constant
+ * coefficients you are required to implement.)
+ *
+ * not_use, not_use2: ignored; present only so the signature matches
+ *                    peval_fun.
+ * x:                 point at which the polynomial is evaluated.
+ * Returns the polynomial value, wrapped to the width of int on overflow.
+ *
+ * Each coefficient multiply is decomposed into shifts and adds:
+ *   90 = 64 + 32 - 4 - 2
+ *   42 = 32 + 8 + 2
+ *   88 = 64 + 16 + 8
+ * The arithmetic is done on unsigned values because left-shifting a
+ * negative int and overflowing a signed int are both undefined behavior;
+ * unsigned operations wrap and yield the same low-order bits.
+ */
+int const_poly_eval(int *not_use, int not_use2, int x)
+{
+    unsigned int ux = (unsigned int)x;
+    unsigned int c90 = (ux << 6) + (ux << 5) - (ux << 2) - (ux << 1);
+    unsigned int c42 = (ux << 5) + (ux << 3) + (ux << 1);
+    unsigned int c88 = (ux << 6) + (ux << 4) + (ux << 3);
+
+    (void)not_use;
+    (void)not_use2;
+    return (int)(21u + c90 + (c42 + c88 * ux) * ux);
+}
+
+
+
+/* Polynomial evaluation function. Note: this is only a reference implementation; you need to write your own version */
+
+/*
+	Tip: lcc supports embedded assembly in AT&T syntax, for example
+	
+	_asm("movl %eax,%ebx");
+	_asm("pushl %edx");
+	
+	In lcc you can tick project->configuration->Compiler->Code Generation->Generate .asm;
+	once it is selected, the assembly code for the program is generated under the lcc directory. By reading the assembly file
+	you can see how the compiler implements your code. Some of it may be very inefficient.
+	You can add embedded assembly in suitable places to greatly improve performance.
+*/
+
+int poly_eval(int *a, int degree, int x)
+{
+    /* Reference evaluator: a[0] + a[1]*x + ... + a[degree]*x^degree. */
+    int sum = 0;
+    int power = 1; /* x^k for the term about to be added */
+    int k = 0;
+    while (k <= degree) {
+        sum += a[k++] * power;
+        power *= x;
+    }
+    return sum;
+}
+
+/* Kernel is defined further down in this file; this declaration lets the
+   launch below compile. */
+__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results);
+
+/*
+ * cuda_poly_eval_low_cpe - CUDA evaluator registered as the low-CPE entry.
+ *
+ * The kernel writes one term per thread into a device scratch array; the
+ * terms are then copied back and summed on the host.
+ *
+ * a:      host array of degree + 1 coefficients.
+ * degree: polynomial degree; a negative degree yields 0 (empty sum).
+ * x:      evaluation point.
+ * Returns the polynomial value. If a CUDA call or the host allocation
+ * fails, a message is printed and 0 is returned.
+ */
+int cuda_poly_eval_low_cpe(int *a, int degree, int x)
+{
+    /* Everything is declared up front so that the cleanup gotos never jump
+       over an initialization (ill-formed in C++). */
+    const unsigned int threads = 256;   /* threads per block */
+    unsigned int blocks = 0;            /* blocks needed to cover all terms */
+    size_t terms = 0;                   /* number of coefficients */
+    size_t bytes = 0;                   /* size of each int buffer */
+    int *d_a = NULL;                    /* device copy of the coefficients */
+    int *d_results = NULL;              /* device buffer, one term per slot */
+    int *h_results = NULL;              /* host copy of the terms */
+    int result = 0;
+    cudaError_t err = cudaSuccess;
+
+    /* Empty polynomial: nothing to launch. This also avoids a zero-block
+       grid, which is not a valid launch configuration. */
+    if (degree < 0)
+        return 0;
+
+    terms = (size_t)degree + 1;
+    bytes = terms * sizeof(int);
+    blocks = (unsigned int)((terms + threads - 1) / threads);
+
+    err = cudaMalloc(&d_a, bytes);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    err = cudaMalloc(&d_results, bytes);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    err = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    cudaPolyEvalLowCPE<<<blocks, threads>>>(d_a, degree, x, d_results);
+    err = cudaGetLastError();           /* picks up launch failures */
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    h_results = (int *)malloc(bytes);
+    if (h_results == NULL) {
+        printf("Memory allocation error\n");
+        goto cleanup;
+    }
+
+    err = cudaMemcpy(h_results, d_results, bytes, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    for (size_t k = 0; k < terms; k++)
+        result += h_results[k];
+    goto cleanup;
+
+cuda_fail:
+    printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    result = 0;
+cleanup:
+    /* free() and cudaFree() both accept a null pointer, so this single exit
+       path serves every failure point above. */
+    free(h_results);
+    cudaFree(d_a);
+    cudaFree(d_results);
+    return result;
+}
+
+/* Kernel is defined further down in this file; this declaration lets the
+   launch below compile. */
+__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result);
+
+/*
+ * cuda_poly_eval_degree10 - CUDA evaluator registered as the degree-10 entry.
+ *
+ * Copies the coefficients to the device, launches a single block that
+ * leaves the finished sum in one device int, and copies that int back.
+ *
+ * a:      host array of degree + 1 coefficients.
+ * degree: polynomial degree; a negative degree yields 0 (empty sum).
+ * x:      evaluation point.
+ * Returns the value produced by the kernel, or 0 after printing a message
+ * when a CUDA call fails.
+ *
+ * NOTE(review): only one block of 256 threads is launched, so the kernel
+ * itself has to cope with degree >= 256 - confirm against its definition.
+ */
+int cuda_poly_eval_degree10(int *a, int degree, int x)
+{
+    /* Declared up front so the cleanup gotos never jump over an
+       initialization (ill-formed in C++). */
+    const unsigned int threads = 256;   /* threads in the single block */
+    size_t bytes = 0;                   /* size of the coefficient buffer */
+    int *d_a = NULL;                    /* device copy of the coefficients */
+    int *d_result = NULL;               /* device int receiving the sum */
+    int result = 0;
+    cudaError_t err = cudaSuccess;
+
+    if (degree < 0)                     /* empty polynomial */
+        return 0;
+    bytes = ((size_t)degree + 1) * sizeof(int);
+
+    err = cudaMalloc(&d_a, bytes);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    err = cudaMalloc(&d_result, sizeof(int));
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    err = cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    /* One block only: the kernel accumulates in block-shared memory. */
+    cudaPolyEvalDegree10<<<1, threads>>>(d_a, degree, x, d_result);
+    err = cudaGetLastError();           /* picks up launch failures */
+    if (err != cudaSuccess)
+        goto cuda_fail;
+
+    err = cudaMemcpy(&result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess)
+        goto cuda_fail;
+    goto cleanup;
+
+cuda_fail:
+    printf("CUDA Error: %s\n", cudaGetErrorString(err));
+    result = 0;
+cleanup:
+    cudaFree(d_a);                      /* cudaFree accepts a null pointer */
+    cudaFree(d_result);
+    return result;
+}
+
+/* CUDA kernel, low-CPE variant: thread k stores a[k] * x^k in results[k]. */
+__global__ void cudaPolyEvalLowCPE(int *a, int degree, int x, int *results)
+{
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx > degree) return;   /* surplus threads of the last block */
+    /* Square-and-multiply: O(log idx) steps instead of idx multiplies.
+       Unsigned math wraps, giving the same low-order bits as wrapping int
+       arithmetic without relying on signed overflow. */
+    unsigned int power = 1u, base = (unsigned int)x;
+    for (int e = idx; e > 0; e >>= 1) {
+        if (e & 1) power *= base;
+        base *= base;
+    }
+    results[idx] = (int)((unsigned int)a[idx] * power);
+}
+
+/* base^e with wrapping unsigned arithmetic, by square-and-multiply. */
+static __device__ unsigned int poly_wrap_pow(unsigned int base, unsigned int e)
+{
+    unsigned int p = 1u;
+    for (; e > 0u; e >>= 1, base *= base)
+        if (e & 1u) p *= base;
+    return p;
+}
+
+/*
+ * CUDA kernel, single-block variant: the threads of one block cooperate to
+ * leave a[0] + a[1]*x + ... + a[degree]*x^degree in *result.
+ * Launch with exactly one block; the sum lives in block-shared memory.
+ */
+__global__ void cudaPolyEvalDegree10(int *a, int degree, int x, int *result)
+{
+    __shared__ int block_sum;
+    const int tid = (int)threadIdx.x;  /* signed, so degree < 0 compares sanely */
+    if (tid == 0)
+        block_sum = 0;
+    __syncthreads();                   /* the zero is visible before any add */
+
+    /* Thread t sums terms t, t + blockDim.x, ... so degrees of blockDim.x or
+       more are still covered. Unsigned math wraps rather than overflowing. */
+    if (tid <= degree) {
+        const unsigned int step = poly_wrap_pow((unsigned int)x, blockDim.x);
+        unsigned int power = poly_wrap_pow((unsigned int)x, threadIdx.x);
+        unsigned int sum = 0u;
+        for (int k = tid; k <= degree; k += (int)blockDim.x, power *= step)
+            sum += (unsigned int)a[k] * power;
+        atomicAdd(&block_sum, (int)sum);
+    }
+    __syncthreads();                   /* all adds are done before the read */
+
+    if (tid == 0)
+        *result = block_sum;
+}
+
+/*
+	Table of implementations; each entry is {function name, "description string"}.
+	Put the two implementations you consider best at the front.
+	For example:
+	{my_poly_eval1, "super rubbish implementation"},
+	{my_poly_eval2, "slightly better implementation"},
+*/
+   
+peval_fun_rec peval_fun_tab[] = 
+{
+
+  /* First entry: your implementation with the best CPE */
+ {cuda_poly_eval_low_cpe, "CUDA optimized low CPE implementation"},
+  /* Second entry: your implementation with the best performance at degree 10 */
+ {cuda_poly_eval_degree10, "CUDA optimized degree 10 implementation"},
+
+ {poly_eval, "poly_eval: 参考实现"},
+
+ /* Do not modify or delete the entry below!! It marks the end of the table */
+ {NULL, ""}
+};
+
+
+
+
+
+
+
diff --git a/perflab/poly/poly.o b/perflab/poly/poly.o
new file mode 100644
index 0000000..1b650e0
Binary files /dev/null and b/perflab/poly/poly.o differ
diff --git a/perflab/poly/poly_test.c b/perflab/poly/poly_test.c
index 8c68435..c2e8701 100644
--- a/perflab/poly/poly_test.c
+++ b/perflab/poly/poly_test.c
@@ -6,6 +6,7 @@
 #include "poly.h"
 #include "cpe.h"
 #include "clock.h"
+#include <time.h>
 
 double CPU_Mhz;
 
@@ -17,7 +18,7 @@ static int coeff[MAXDEGREE+1];
 
 #define MAX_ITER_COUNT 100
 
-#define REF_CPU_MHZ 2292.6		// ÕâÊÇÎÒµÄ´¦ÀíÆ÷Ö÷Æµ
+#define REF_CPU_MHZ 2292.6		// this is my processor's clock frequency
 
 /* Define performance standards */
 static struct {
@@ -26,7 +27,7 @@ static struct {
 } cstandard[3] =
 {{4.00, 1.75}, /* CPE */
  {50, 43}, /* C(10) */
- {57,31} /* ³£ÏµÊý¶àÏîÊ½¼ÆËã */
+ {57,31} /* constant-coefficient polynomial evaluation */
 };
 
 int coeff_const[4];
@@ -82,7 +83,7 @@ static void init_const_poly(void)
     	coeff_const[i] = rand_div+10;
     }
 
-    printf("ÄãÐèÒªÐÞ¸Äpoly.cµÄconst_poly_evalº¯Êý£¬ÊµÏÖÏÂÃæµÄ³£Êý¶àÏîÊ½¼ÆËã£¡\n");
+    printf("ï¿½ï¿½ï¿½ï¿½Òªï¿½Þ¸ï¿½poly.cï¿½ï¿½const_poly_evalï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Êµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ä³ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ã£¡\n");
     printf("\tresult=%d+%d*x+%d*x^2+%d*x^3\n",coeff_const[0],coeff_const[1],coeff_const[2],coeff_const[3]);
 
 		fixval_const = ref_poly_eval(coeff_const, 3, xval);
@@ -97,15 +98,15 @@ void test_const_poly(void)
 	int my_cal = const_poly_eval(coeff_const, 3, xval);
 	if (fixval_const != my_cal)
 	{
-		printf("³£ÏµÊý¶àÏîÊ½¼ÆËãconst_poly_evalÊµÏÖ´íÎó£¨x=%d£©£¬Ô¤ÆÚ½á¹ûÊÇ%d£¬µ«ÊÇ¼ÆËãµÃµ½µÄÊÇ%d\n",xval,fixval_const,my_cal);
+		printf("ï¿½ï¿½Ïµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ï¿½const_poly_evalÊµï¿½Ö´ï¿½ï¿½ï¿½x=%dï¿½ï¿½ï¿½ï¿½Ô¤ï¿½Ú½ï¿½ï¿½ï¿½ï¿½%dï¿½ï¿½ï¿½ï¿½ï¿½Ç¼ï¿½ï¿½ï¿½Ãµï¿½ï¿½ï¿½ï¿½ï¿½%d\n",xval,fixval_const,my_cal);
 		exit(0);
 	}
 	fix_time = 0;
 	for (i=0;i<MAX_ITER_COUNT;i++)
 		fix_time += measure_function(run_fun_const, 3);
 	fix_time = fix_time / MAX_ITER_COUNT;
-	    printf("  ³£ÏµÊý¶àÏîÊ½¼ÆËãÊ±¼ä = %.1f\n", fix_time);
-      printf("  ×î¸ßµÄ³£ÏµÊý¶àÏîÊ½¼ÆËãµÃ·Ö ============== %.0f\n",
+	    printf("  ï¿½ï¿½Ïµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ï¿½Ê±ï¿½ï¿½ = %.1f\n", fix_time);
+      printf("  ï¿½ï¿½ßµÄ³ï¿½Ïµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ï¿½Ã·ï¿½ ============== %.0f\n",
 	     compute_score(fix_time, cstandard[2].cref, cstandard[2].cbest));
 }
 
@@ -132,7 +133,7 @@ int test_poly(peval_fun f, FILE *rpt) {
 	    ok = 0;
 	    if (rpt) {
 		fprintf(rpt,
- "´íÎó£¡¶àÏîÊ½¼ÆËã²»¶Ô£¡½×=%dÊ±£¬¼ÆËãµÄÖµÊÇ%d£¬¶øÕýÈ·ÖµÊÇ%d\n",
+ "ï¿½ï¿½ï¿½ó£¡¶ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ã²»ï¿½Ô£ï¿½ï¿½ï¿½=%dÊ±ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Öµï¿½ï¿½%dï¿½ï¿½ï¿½ï¿½ï¿½ï¿½È·Öµï¿½ï¿½%d\n",
 			MAXDEGREE-i, v, pval[i]);
 	    }
 	}
@@ -142,7 +143,7 @@ int test_poly(peval_fun f, FILE *rpt) {
 	ok = 0;
 	if (rpt) {
 	    fprintf(rpt,
-    "´íÎó£¡¶àÏîÊ½¼ÆËã²»¶Ô£¡½×=%dÊ±£¬¼ÆËãµÄÖµÊÇ%d£¬¶øÕýÈ·ÖµÊÇ%d\n",
+    "ï¿½ï¿½ï¿½ó£¡¶ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ã²»ï¿½Ô£ï¿½ï¿½ï¿½=%dÊ±ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Öµï¿½ï¿½%dï¿½ï¿½ï¿½ï¿½ï¿½ï¿½È·Öµï¿½ï¿½%d\n",
 		    FIXDEGREE, v, fixval);
 	}
     }
@@ -175,7 +176,7 @@ void run_poly(peval_fun f, char *descr, double *cpep, double *cfixp)
 		double cpe=0;
 		double fix_time=0;
     pfun = f;
-    printf("º¯Êý£º%s\n", descr);
+    printf("ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½%s\n", descr);
     if (test_poly(f, stdout)) {
     	cpe = 0;
     	for (i=0;i<MAX_ITER_COUNT;i++)
@@ -206,7 +207,7 @@ static double compute_score(double cmeas, double cref, double cbest)
   return 100*((smeas-1.0)/(sbest-1.0) + 0.1);
 }
 
-/* ²úÉúÒ»¸ö0~divv-1Ö®¼äµÄËæ»úÊý£¬Í¬Ê±¸üÐÂËæ»úÊýÖÖ×Ó */
+/* Generate a random number between 0 and divv-1, updating the random seed at the same time */
 void GenerateRandomNumber(unsigned long divv)
 {
 	unsigned long long x = rand1_h;
@@ -230,18 +231,18 @@ int main(int argc, char *argv[])
 
 //	CPU_Factor();
 //	GetCpuClock();
-	printf("\t2015¶àÏîÊ½ÓÅ»¯ÊµÑé£¬»¶Ó­Äã£¡\n");
+	printf("\t2015ï¿½ï¿½ï¿½ï¿½Ê½ï¿½Å»ï¿½Êµï¿½é£¬ï¿½ï¿½Ó­ï¿½ã£¡\n");
 	printf("============================\n");
 
 	if (argc == 1)
 	{
-		printf("Ê¹ÓÃ·½·¨£º%s Ñ§ºÅºó6Î» [Ñ§ºÅºó6Î»] [Ñ§ºÅºó6Î»] ...\n",argv[0]);
-		printf("ÄãÐèÒªÒÀ¾ÝÌáÊ¾¸ÄÐ´poly.c³ÌÐò£¬ÊµÏÖÒ»¸ö³£ÏµÊý¶àÏîÊ½µÄ¼ÆËã£¬¾¡¿ÉÄÜ¿ìÅ¶....\n");
-		printf("ÁíÍâ£¬ÄãÐèÒª¸ÄÐ´poly.c³ÌÐò£¬ÊµÏÖÈÎÒâ½×µÄ¶àÏîÊ½¼ÆËãºÍ10½×µÄ¶àÏîÊ½¼ÆËã£¬Òª¿ì£¡\n");
+		printf("Ê¹ï¿½Ã·ï¿½ï¿½ï¿½ï¿½ï¿½%s Ñ§ï¿½Åºï¿½6Î» [Ñ§ï¿½Åºï¿½6Î»] [Ñ§ï¿½Åºï¿½6Î»] ...\n",argv[0]);
+		printf("ï¿½ï¿½ï¿½ï¿½Òªï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê¾ï¿½ï¿½Ð´poly.cï¿½ï¿½ï¿½ï¿½Êµï¿½ï¿½Ò»ï¿½ï¿½ï¿½ï¿½Ïµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½Ê½ï¿½Ä¼ï¿½ï¿½ã£¬ï¿½ï¿½ï¿½ï¿½ï¿½Ü¿ï¿½Å¶....\n");
+		printf("ï¿½ï¿½ï¿½â£¬ï¿½ï¿½ï¿½ï¿½Òªï¿½ï¿½Ð´poly.cï¿½ï¿½ï¿½ï¿½Êµï¿½ï¿½ï¿½ï¿½ï¿½ï¿½×µÄ¶ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ï¿½ï¿½10ï¿½×µÄ¶ï¿½ï¿½ï¿½Ê½ï¿½ï¿½ï¿½ã£¬Òªï¿½ì£¡\n");
 		return 0;
 	}
 
-	/*ÒÀ¾ÝÑ§ºÅ£¬³õÊ¼»¯Ò»¸öËæ»úÊý·¢ÉúÆ÷*/
+	/* Initialize a random number generator from the student ID */
 	rand1_h = (unsigned long)atoi(argv[1]);
 	rand1_l=0x29A;
 	GenerateRandomNumber(0);
@@ -266,10 +267,10 @@ int main(int argc, char *argv[])
   	//make_CPU_busy();
     run_poly(peval_fun_tab[i].f, peval_fun_tab[i].descr, &cpe, &cfix);
     if (i == 0)
-      printf("  ×î¸ßµÄCPEµÃ·Ö =========================== %.0f\n",
+      printf("  ï¿½ï¿½ßµï¿½CPEï¿½Ã·ï¿½ =========================== %.0f\n",
 	     compute_score(cpe, cstandard[0].cref, cstandard[0].cbest));
     if (i == 1)
-      printf("  ×î¸ßµÄC(10)µÃ·Ö ========================= %.0f\n",
+      printf("  ï¿½ï¿½ßµï¿½C(10)ï¿½Ã·ï¿½ ========================= %.0f\n",
 	     compute_score(cfix, cstandard[1].cref, cstandard[1].cbest));
   }
   return 0;
diff --git a/perflab/poly/poly_test.o b/perflab/poly/poly_test.o
new file mode 100644
index 0000000..2a115cc
Binary files /dev/null and b/perflab/poly/poly_test.o differ