Monday, April 2, 2012

Filter text with a stop word list

Filter text with a stop word list

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* slots in the token array built by split(), NULL terminator included */
#define MAXTOKENS       256
/* size of the line buffers handed to fgets() */
#define MAXLINE         1024
/* main() skips input lines whose length (newline included) is below this */
#define MINLEN          3
/* buildstoptree() skips stop list lines whose length (newline included) is below this */
#define STMINLEN        2

/* node of the binary tree that stores the stop words, ordered by strcmp() */
struct tnode {
 /* heap-allocated copy of the word; released in freetree() */
 char *word;
 /* number of times this word has been passed to addtree() */
 int count;
 /* left: words comparing lower than word; right: words comparing higher */
 struct tnode *left, *right;
};

/* load a stop list file (one entry per line) into a tree */
struct tnode *buildstoptree(char *, struct tnode *);
/* insert a word into the tree, or bump its count if already present */
struct tnode *addtree(struct tnode *, char *);
/* look a word up in the tree; NULL when it is not there */
struct tnode *findstopword(struct tnode *, char *);
/* malloc() one tree node */
struct tnode *talloc(void);
/* release a tree */
void freetree(struct tnode *);
/* split a string on a set of delimiter characters into a NULL-terminated array */
char **split(char *, char *);

/*
 * tokstop: filter stdin through a stop word list.
 *
 * argv[1] names the stop list file (one entry per line). Every input line
 * of at least MINLEN characters is split into tokens; tokens that are not
 * found in the stop list are written to stdout, each followed by a space,
 * and the line is finished with a newline. Shorter lines are skipped.
 *
 * Returns 0 on success and 1 on a usage error, when the stop list cannot
 * be loaded, or when a line cannot be tokenized.
 */
int main(int argc, char *argv[]) {
 /* delim does not include \' [\047] quote */
 char *delim = ".,:;`\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
 char **tokens = NULL;
 struct tnode *root = NULL;
 char line[MAXLINE];
 int i = 0;

 if(argc != 2) {
  fprintf(stderr, "Usage: tokstop STOPLIST.txt\n");
  return 1;
 }

 root = buildstoptree(argv[1], root);
 if(root == NULL)
  return 1;

 while(fgets(line, sizeof line, stdin) != NULL) {
  if(strlen(line) < MINLEN)
   continue;

  tokens = split(line, delim);
  if(tokens == NULL) {
   /* split() reports allocation failure with NULL */
   fprintf(stderr, "Error - split()\n");
   freetree(root);
   return 1;
  }

  for(i = 0; tokens[i] != NULL; i++) {
   if(findstopword(root, tokens[i]) == NULL)
    printf("%s ", tokens[i]);
   free(tokens[i]);
  }

  /* release the array itself, not its NULL terminator slot */
  free(tokens);
  printf("\n");
 }

 freetree(root);
 return 0;
}

/*
 * Read a stop list into a binary tree; expects one entry per line.
 *
 * fname: path of the stop list file.
 * p:     root of the tree to add to (NULL to start a new tree).
 *
 * Lines shorter than STMINLEN characters (terminator included) and lines
 * that are empty once the terminator is removed are ignored.
 *
 * Returns the root of the tree, or NULL after printing a message to
 * stderr when the file cannot be opened or holds no usable entry.
 */
struct tnode *buildstoptree(char *fname, struct tnode *p) {
 FILE *fp = NULL;
 char line[MAXLINE];
 size_t len = 0;
 int lcount = 0;

 fp = fopen(fname, "r");
 if(fp == NULL) {
  fprintf(stderr, "Error - fopen(%s)\n", fname);
  return NULL;
 }

 while(fgets(line, sizeof line, fp) != NULL) {
  len = strlen(line);
  if(len < STMINLEN)
   continue;

  /* strip LF or CRLF so a stored word can compare equal to a token */
  while(len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
   line[--len] = '\0';
  if(len == 0)
   continue;

  lcount++;
  p = addtree(p, line);
 }

 /* close before the error check so no return path leaks the stream */
 fclose(fp);

 if(lcount == 0) {
  fprintf(stderr, "Error - Zero stopwords..\n");
  return NULL;
 }

 return p;
}

/*
 * Split string into tokens separated by any character found in delim.
 *
 * The input is left untouched; tokenizing is done on a private copy.
 * At most MAXTOKENS - 1 tokens are returned, the rest are dropped.
 *
 * Returns a heap-allocated array of heap-allocated token strings that is
 * terminated by a NULL entry. The caller frees every token and then the
 * array. Returns NULL, with nothing left allocated, when memory runs out.
 */
char **split(char *string, char *delim) {
 char **tokens = NULL;
 char *working = NULL;
 char *token = NULL;
 size_t size = 0;
 int idx = 0;

 tokens = malloc(sizeof *tokens * MAXTOKENS);
 if(tokens == NULL)
  return NULL;

 /* strtok() writes into its argument, so work on a copy */
 working = malloc(strlen(string) + 1);
 if(working == NULL) {
  free(tokens);
  return NULL;
 }
 strcpy(working, string);

 /* pre-fill with NULL so the array is terminated wherever we stop */
 for(idx = 0; idx < MAXTOKENS; idx++)
  tokens[idx] = NULL;
 idx = 0;

 /* stop one short of MAXTOKENS to keep the last entry NULL */
 for(token = strtok(working, delim);
     token != NULL && idx < MAXTOKENS - 1;
     token = strtok(NULL, delim)) {
  size = strlen(token) + 1;
  tokens[idx] = malloc(size);
  if(tokens[idx] == NULL) {
   /* give up: retrying a failed malloc here would never terminate */
   while(idx > 0)
    free(tokens[--idx]);
   free(tokens);
   free(working);
   return NULL;
  }
  memcpy(tokens[idx], token, size);
  idx++;
 }

 free(working);
 return tokens;
}

/*
 * Install word w in the binary tree rooted at p.
 *
 * A word already in the tree only has its count incremented; a new word
 * gets a fresh leaf holding its own copy of w. Ordering follows strcmp().
 *
 * Returns the root of the (sub)tree. When a new leaf cannot be allocated
 * the word is dropped and NULL is returned for that leaf position, which
 * leaves the rest of the tree unchanged.
 */
struct tnode *addtree(struct tnode *p, char *w) {
 int cond;

 if(p == NULL) {
  size_t size = strlen(w) + 1;

  p = talloc();
  if(p == NULL)
   return NULL;

  /* malloc + memcpy instead of strdup(), which is not ISO C */
  p->word = malloc(size);
  if(p->word == NULL) {
   free(p);
   return NULL;
  }
  memcpy(p->word, w, size);

  p->count = 1;
  p->left = p->right = NULL;
 } else if((cond = strcmp(w, p->word)) == 0)
  p->count++;
 else if(cond < 0)
  p->left = addtree(p->left, w);
 else
  p->right = addtree(p->right, w);

 return p;
}

/* allocate storage for one tnode; contents are uninitialized, NULL on failure */
struct tnode *talloc(void) {
 struct tnode *node = malloc(sizeof *node);

 return node;
}

/*
 * Look up w in the binary tree rooted at p.
 *
 * Descends without recursion: to the left when the node's word compares
 * greater than w, to the right when it compares lower.
 * Returns the node holding w, or NULL when w is not in the tree.
 */
struct tnode *findstopword(struct tnode *p, char *w) {
 struct tnode *node = p;

 while(node != NULL) {
  int order = strcmp(node->word, w);

  if(order == 0)
   break;
  node = (order > 0) ? node->left : node->right;
 }

 return node;
}

/*
 * Free the whole binary tree rooted at p: every node and every word.
 * Passing NULL is allowed and does nothing.
 *
 * Left subtrees are released recursively while the right spine is walked
 * in a loop, so a tree built from an already sorted list (one long chain
 * of right children) does not deepen the call stack.
 */
void freetree(struct tnode *p) {
 while(p != NULL) {
  /* remember the right child before the node is released */
  struct tnode *right = p->right;

  freetree(p->left);
  free(p->word);
  free(p);

  p = right;
 }
}

No comments:

Post a Comment