Monday, April 2, 2012

Create distinct wordlist sorted on frequency

Create distinct wordlist sorted on frequency

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAXLINE         1024
#define MAXTOKENS       256
#define MINLEN          3

struct tnode {
 char *word;
 int count;
 struct tnode *left, *right;
};

char **split(char *, char *);
struct tnode *addtree(struct tnode *, char *);
struct tnode *talloc(void);
void treeprint(struct tnode *);
void treefree(struct tnode *);
void arrayprint(struct tnode **, int);
int compare(const void *, const void *);
int tnodecount(struct tnode *);
int treetoarray(struct tnode *, struct tnode **);

int main(void) {
 char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
 char **tokens = NULL;
 char line[MAXLINE];
 struct tnode *root = {0};
 struct tnode **tarray = {0};
 int i, len, lcount, ncount;

 i = len = lcount = ncount = 0;

 while(fgets(line, MAXLINE, stdin) != NULL) {
  len = strlen(line);
  if(len < MINLEN)
   continue;
  else
   lcount++;

  if(line[len - 1] == '\n')
   line[--len] = '\0';

  tokens = split(line, delim);
  for(i = 0; tokens[i] != NULL; i++)
   root = addtree(root, tokens[i]);

  for(i = 0; tokens[i] != NULL; i++)
   free(tokens[i]);
  free(tokens);
 }

 ncount = tnodecount(root);
 if(ncount == 0)
  return 1;
 /* treeprint(root); */

 tarray = malloc(ncount * sizeof(struct tnode *));
 if(tarray == NULL)
  return 1;
 if(treetoarray(root, tarray) == -1)
  return 1;
 qsort(tarray, ncount, sizeof(struct tnode *), compare);
 arrayprint(tarray, ncount);

 treefree(root);

 return 0;
}

struct tnode *addtree(struct tnode *p, char *w) {
 int cond;

 if(p == NULL) {
  p = talloc();
  p->word = strdup(w);
  p->count = 1;
  p->left = p->right = NULL;
 } else if((cond = strcmp(w, p->word)) == 0)
  p->count++;
 else if(cond < 0)
  p->left = addtree(p->left, w);
 else
  p->right = addtree(p->right, w);

 return p;
}

struct tnode *talloc(void) {
 return(struct tnode *)malloc(sizeof(struct tnode));
}

void treeprint(struct tnode *p) {
 if(p != NULL) {
  treeprint(p->left);
  printf("%4d %s\n", p->count, p->word);
  treeprint(p->right);
 }
}

void treefree(struct tnode *p) {
 if(p != NULL) {
  treefree(p->left);
  treefree(p->right);

  free(p->word);
  free(p);
 }

 return;
}

void arrayprint(struct tnode **p, int x) {
 int i = 0;

 for(i = 0; i < x; i++)
  printf("%4d %s\n", p[i]->count, p[i]->word);

 return;
}

char **split(char *string, char *delim) {
 char **tokens = NULL;
 char *working = NULL;
 char *token = NULL;
 int idx = 0;

 tokens  = malloc(sizeof(char *) * MAXTOKENS);
 if(tokens == NULL)
  return NULL;
 working = malloc(sizeof(char) * strlen(string) + 1);
 if(working == NULL)
  return NULL;

 strcpy(working, string);
 for(idx = 0; idx < MAXTOKENS; idx++)
  tokens[idx] = NULL;

 token = strtok(working, delim);
 idx = 0;

 while((idx < (MAXTOKENS - 1)) && (token != NULL)) {
  tokens[idx] = malloc(sizeof(char) * strlen(token) + 1);
  if(tokens[idx] != NULL) {
   strcpy(tokens[idx], token);
   idx++;
   token = strtok(NULL, delim);
  }
 }

 free(working);
 return tokens;
}

int treetoarray(struct tnode *tree, struct tnode **array) {
 static struct tnode **write = NULL;

 if(tree == NULL)
  return -1;

 if(array != NULL)
  write = array;

 if(tree != NULL) {
  *write = tree, write++;
  if(tree->left != NULL)
   treetoarray(tree->left, NULL);
  if(tree->right != NULL)
   treetoarray(tree->right, NULL);
 }

 return 0;
}

int compare(const void *x, const void *y) {
 struct tnode * const *a = x;
 struct tnode * const *b = y;

 if(a == NULL || b == NULL)
  return -2;
 else {
  if((*a)->count < (*b)->count)
   return 1;
  else if((*a)->count > (*b)->count)
   return -1;
  else
   return 0;
 }
}

int tnodecount(struct tnode *p) {
 if(p == NULL)
  return 0;
 else {
  if(p->left == NULL && p->right == NULL)
   return 1;
  else
   return(1 + (tnodecount(p->left) + tnodecount(p->right)));
 }
}

No comments:

Post a Comment