csapp2025/profile/prog.c

/* Example of Program for Profiling */
/* Create a dictionary of strings */

#include "options.h"
#include "string.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define USESTRLEN 1
size_t Strlen(const char *s);

/* Some statistics */
int wcnt = 0;       /* Number of words */
int ucnt = 0;       /* Number of unique words */
int mcnt = 0;       /* Count of Most frequent word */
int scnt = 0;       /* Count of number of singletons */
char *mstring = ""; /* Most frequent word */
int llen = 0;       /* Length of the longest word */
char *lstring = ""; /* A longest string */
int lcnt = 0;       /* Number of words having maximum length */

/* Use function pointers to keep track of which options we are using */
typedef void (*lower_fun_t)(char *s);

/* Lower case conversion routines */

/* Convert string to lower case: optimized with lookup table */
static unsigned char lcase_table[256];
static int table_initialized = 0;

void init_lcase_table() {
  if (!table_initialized) {
    int i;
    for (i = 0; i < 256; i++)
      lcase_table[i] = i;
    for (i = 'A'; i <= 'Z'; i++)
      lcase_table[i] = i - ('A' - 'a');
    table_initialized = 1;
  }
}

void lower1(char *s) {
  init_lcase_table();
  unsigned char *us = (unsigned char *)s;
  while (*us) {
    *us = lcase_table[*us];
    us++;
  }
}

/* Convert string to lower case: faster */
void lower2(char *s) {
  int i;
  int len = Strlen(s);

  for (i = 0; i < len; i++)
    if (s[i] >= 'A' && s[i] <= 'Z')
      s[i] -= ('A' - 'a');
}

/* Set of lower case functions */
#define LCNT 2
lower_fun_t lower_fun_set[LCNT] = {lower1, lower2};
char *lower_fun_names[LCNT] = {"lower1", "lower2"};

/* Implementation of library function strlen */
/* Compute length of string */
size_t Strlen(const char *s) {
#ifdef USESTRLEN
  return strlen(s);
#else
  int length = 0;
  while (*s != '\0') {
    s++;
    length++;
  }
  return length;
#endif
}

/* The hash table */

typedef struct HELE {
  char *word;
  int freq;
  struct HELE *next;
} h_rec, *h_ptr;

/* The hash table */
h_ptr *htable;
int tsize;

static void new_table(int size) {
  tsize = size;
  htable = (h_ptr *)calloc(size, sizeof(h_ptr));
  if (!htable) {
    fprintf(stderr, "Couldn't allocate hash array, exiting\n");
    exit(1);
  }
}

static h_ptr new_ele(char *s) {
  h_ptr result = (h_ptr)malloc(sizeof(h_rec));
  int wlen = Strlen(s);
  if (wlen > llen) {
    lstring = s;
    llen = wlen;
    lcnt = 1;
  } else if (wlen == llen)
    lcnt++;
  if (!result) {
    fprintf(stderr, "Couldn't allocate hash element, exiting\n");
    exit(1);
  }

  result->word = s;
  result->freq = 1;
  return result;
}

/* Some hash functions */

/* Division hashing */

typedef unsigned (*hash_fun_t)(char *s);

unsigned h_mod(char *s) {
  unsigned val = 0;
  int c;
  while ((c = *s++))
    val = (val * 128 + c) % tsize;
  return val;
}

/* Simply add characters together */
unsigned h_add(char *s) {
  unsigned val = 0;
  int c;
  while ((c = *s++))
    val += c;
  return val % tsize;
}

/* Combine with Xors */
unsigned h_xor(char *s) {
  unsigned val = 0;
  int c;
  while ((c = *s++))
    val = ((val ^ c) << 4) | ((val >> 28) & 0xF);
  return val % tsize;
}

/* FNV-1a hash function */
unsigned h_fnv1a(char *s) {
  unsigned hash = 2166136261u;
  unsigned char *us = (unsigned char *)s;
  while (*us) {
    hash ^= *us++;
    hash *= 16777619u;
  }
  return hash % tsize;
}

#define HCNT 4
hash_fun_t hash_fun_set[HCNT] = {h_mod, h_add, h_xor, h_fnv1a};
char *hash_fun_names[HCNT] = {"h_mod", "h_add", "h_xor", "h_fnv1a"};

char *save_string(char *s) {
  char *result = (char *)malloc(Strlen(s) + 1);
  if (!result) {
    fprintf(stderr, "Couldn't allocate space for string, exiting\n");
    exit(1);
  }
  strcpy(result, s);
  return result;
}

/* Recursively find string in list.  Add to end if not found */
h_ptr find_ele_rec(h_ptr ls, char *s) {
  if (!ls) {
    /* Come to end of list.  Insert this one */
    ucnt++;
    return new_ele(save_string(s));
  }
  if (strcmp(s, ls->word) == 0) {
    ls->freq++;
    if (ls->freq > mcnt) {
      mcnt = ls->freq;
      mstring = ls->word;
    }
    return ls;
  }
  ls->next = find_ele_rec(ls->next, s);
  return ls;
}

/* Iteratively find string in list.  Add to front if not found */
h_ptr find_ele_iter_f(h_ptr ls, char *s) {
  h_ptr ele = ls;
  for (ele = ls; ele; ele = ele->next) {
    char *word = ele->word;
    if (strcmp(s, word) == 0) {
      int freq = ++ele->freq;
      if (freq > mcnt) {
        mcnt = freq;
        mstring = word;
      }
      return ls;
    }
  }
  ele = new_ele(save_string(s));
  ucnt++;
  ele->next = ls;
  return ele;
}

/* Iteratively find string in list.  Add to end if not found */
h_ptr find_ele_iter_r(h_ptr ls, char *s) {
  h_ptr ele = ls;
  h_ptr last = NULL;
  for (ele = ls; ele; ele = ele->next) {
    char *word = ele->word;
    if (strcmp(s, word) == 0) {
      int freq = ++ele->freq;
      if (freq > mcnt) {
        mcnt = freq;
        mstring = word;
      }
      return ls;
    }
    last = ele;
  }
  ele = new_ele(save_string(s));
  ucnt++;
  ele->next = NULL;
  if (last) {
    last->next = ele;
    return ls;
  } else
    return ele;
}

typedef h_ptr (*find_ele_fun_t)(h_ptr, char *);

#define FCNT 3
find_ele_fun_t find_ele_fun_set[FCNT] = {find_ele_rec, find_ele_iter_f,
                                         find_ele_iter_r};
char *find_ele_fun_names[FCNT] = {"find_ele_rec", "find_ele_iter_f",
                                  "find_ele_iter_r"};

/* Comparision function for sorting */
int compare_ele(const void *vele1, const void *vele2) {
  h_ptr ele1 = *(h_ptr *)vele1;
  h_ptr ele2 = *(h_ptr *)vele2;
  return ele2->freq - ele1->freq;
}

/* Sort hash table elements by frequency */
h_ptr sort_words(int quick) {
  h_ptr ls = NULL;
  h_ptr ele;
  h_ptr *array = calloc(ucnt, sizeof(h_ptr));
  int i, j;
  int cnt = 0;
  scnt = 0; /* Count singletons */
  for (i = 0; i < tsize; i++)
    for (ele = htable[i]; ele; ele = ele->next) {
      if (ele->freq == 1)
        scnt++;
      if (quick)
        array[cnt] = ele;
      else {
        for (j = cnt; j > 0 && ele->freq > array[j - 1]->freq; j--)
          array[j] = array[j - 1];
        array[j] = ele;
      }
      cnt++;
    }
  if (quick) {
    qsort((void *)array, cnt, sizeof(h_ptr), compare_ele);
  }
  ls = array[0];
  for (j = 0; j < cnt - 1; j++)
    array[j]->next = array[j + 1];
  array[cnt - 1]->next = NULL;
  free((void *)array);
  return ls;
}

void insert_string(char *s, hash_fun_t hash_fun, lower_fun_t lower_fun,
                   find_ele_fun_t find_ele_fun) {
  int index;
  lower_fun(s);
  index = hash_fun(s);
  htable[index] = find_ele_fun(htable[index], s);
}

/* Extract word from file */
#define BSIZE 1024
char buf[BSIZE];
int bufvalid = 0;
FILE *infile;

void init_token(FILE *in) {
  bufvalid = 0;
  infile = in;
}

/* Added some non-ASCII characters encountered in European parliament corpus */
static char *skipchar = " \t\n\r.,:;/<>()[]{}?!\"-'\0xc2\0xa0";

/* Keep getting tokens.  Return NULL when no more */
char *get_word() {
  char *s = NULL;
  while (1) {
    if (bufvalid) {
      s = strtok(NULL, skipchar);
      if (s)
        break;
    }
    if (!fgets(buf, BSIZE, infile))
      return NULL;
    bufvalid = 1;
    s = strtok(buf, skipchar);
    if (s)
      break;
  }
  wcnt++;
  return s;
}

#define MAXNG 10

char *get_token(int ngram) {
  /* Buffer of last ngram-1 tokens */
  static char token_buf[MAXNG][BSIZE];
  static int first = 1;
  static int bindex = 0; /* In which buffer to insert next token */
  static char sbuf[BSIZE];
  char *nextpos = sbuf;
  int i;
  int index;

  if (ngram == 1)
    return get_word();
  if (first) {
    /* Get ngram-1 tokens */
    while (bindex < ngram - 1) {
      char *word = get_word();
      if (!word) {
        return NULL; /* Document doesn't have enough tokens */
      }
      strcpy(token_buf[bindex++], word);
    }
    first = 0;
  }
  /* Get new token */
  char *word = get_word();
  if (!word) {
    return NULL; /* No more ngrams */
  }
  strcpy(token_buf[bindex++], word);
  if (bindex >= MAXNG)
    bindex = 0;
  /* Generate string of last ngram-1 tokens */
  index = bindex - ngram;
  if (index < 0)
    index += MAXNG;
  for (i = 0; i < ngram; i++) {
    if (i != 0)
      *nextpos++ = ' ';
    word = token_buf[index];
    strcpy(nextpos, word);
    nextpos += Strlen(word);
    index++;
    if (index >= MAXNG)
      index = 0;
  }
#if 0
     printf("Next n-gram = '%s'\n", sbuf);
#endif
  return sbuf;
}

/* Find statistics of word frequency in document */
void word_freq(FILE *src, int verbose, int ngram, int size, int quick,
               hash_fun_t hash_fun, lower_fun_t lower_fun,
               find_ele_fun_t find_ele_fun) {
  char *s;
  h_ptr ls;

  init_token(src);
  new_table(size);

  while ((s = get_token(ngram))) {
    insert_string(s, hash_fun, lower_fun, find_ele_fun);
  }
  if (verbose > 0) {
    ls = sort_words(quick);
    while (ls && verbose--) {
      printf("%d\t'%s'\n", ls->freq, ls->word);
      ls = ls->next;
    }
  }
  printf("%d n-grams, %d unique, %d singletons.  Most common (%d) = '%s'.  "
         "Longest (%d have length %d) = '%s'\n",
         wcnt, ucnt, scnt, mcnt, mstring, lcnt, llen, lstring);
}

int main(int argc, char *argv[]) {
  int verbose = 1;
  int size = 32768; // 修改默认值为32768
  int hash_fun_index = 3;
  int lower_fun_index = 0;
  int find_fun_index = 0;
  int ngram = 1;
  int quick = 0;
  char *fname = NULL;
  FILE *infile = stdin;
  add_int_option("verbose", &verbose);
  add_int_option("size", &size);
  add_int_option("hash", &hash_fun_index);
  add_int_option("lower", &lower_fun_index);
  add_int_option("find", &find_fun_index);
  add_int_option("ngram", &ngram);
  add_int_option("quicksort", &quick);
  add_string_option("file", &fname);
  parse_options(argc, argv, NULL);
  show_options(stdout);
  printf("N-gram size %d\n", ngram);
  printf("Lower case function %s\n", lower_fun_names[lower_fun_index]);
  printf("Hash function %s\n", hash_fun_names[hash_fun_index]);
  printf("Find element function %s\n", find_ele_fun_names[find_fun_index]);
  if ((unsigned)hash_fun_index >= HCNT) {
    fprintf(stderr, "Invalid hash function index %d\n", hash_fun_index);
    exit(1);
  }
  if ((unsigned)lower_fun_index >= LCNT) {
    fprintf(stderr, "Invalid lower function index %d\n", lower_fun_index);
    exit(1);
  }
  if ((unsigned)find_fun_index >= FCNT) {
    fprintf(stderr, "Invalid find function index %d\n", find_fun_index);
    exit(1);
  }
  if (fname) {
    infile = fopen(fname, "r");
    if (!infile) {
      fprintf(stderr, "Couldn't open file '%s'\n", fname);
      exit(1);
    }
  }
  word_freq(infile, verbose, ngram, size, quick, hash_fun_set[hash_fun_index],
            lower_fun_set[lower_fun_index], find_ele_fun_set[find_fun_index]);
  printf("Total time = %f seconds\n", (double)clock() / CLOCKS_PER_SEC);
  fclose(infile);
  return 0;
}