Filter text with a stop word list
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXTOKENS 256
#define MAXLINE 1024
#define MINLEN 3
#define STMINLEN 2
/* node of the binary search tree that holds the stop words */
struct tnode {
    char *word;          /* the stored word; heap copy released together with the node */
    int count;           /* how many times this word was added */
    struct tnode *left;  /* words that compare lower than this one */
    struct tnode *right; /* words that compare higher than this one */
};
struct tnode *buildstoptree(char *, struct tnode *);
struct tnode *addtree(struct tnode *, char *);
struct tnode *findstopword(struct tnode *, char *);
struct tnode *talloc(void);
void freetree(struct tnode *);
char **split(char *, char *);
/*
 * Read text from stdin, split every line into tokens and echo each token
 * that is not found in the stop word list named by argv[1]. The surviving
 * tokens of one input line are printed on one output line, each followed
 * by a space.
 *
 * Returns 0 on success and 1 on a usage error, when no stop word tree
 * could be built, or when tokenising a line runs out of memory.
 */
int main(int argc, char *argv[]) {
    /* delim does not include \' [\047] quote */
    char *delim = ".,:;`\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char **tokens = NULL;
    struct tnode *root = NULL;
    char line[MAXLINE];
    int i = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: tokstop STOPLIST.txt\n");
        return 1;
    }

    root = buildstoptree(argv[1], root);
    if (root == NULL)
        return 1;

    while (fgets(line, MAXLINE, stdin) != NULL) {
        if (strlen(line) < MINLEN)
            continue;

        tokens = split(line, delim);
        if (tokens == NULL) {
            /* split() reports allocation failure with NULL */
            fprintf(stderr, "Error - out of memory\n");
            freetree(root);
            return 1;
        }

        for (i = 0; tokens[i] != NULL; i++) {
            if (findstopword(root, tokens[i]) == NULL)
                printf("%s ", tokens[i]);
        }

        /* release every token string and then the array that held them */
        for (i = 0; tokens[i] != NULL; i++)
            free(tokens[i]);
        free(tokens);
        tokens = NULL;

        printf("\n");
    }

    freetree(root);
    return 0;
}
/*
 * Read a stop word list into a binary tree; expects one entry per line.
 *
 * fname - path of the stop word file
 * p     - root of the tree to add to (NULL starts a new tree)
 *
 * Trailing '\n' and '\r' characters are removed from every line before it
 * is stored, and lines that are empty afterwards are skipped.
 *
 * Returns the root of the tree, or NULL when the file cannot be opened or
 * contains no stop words; a message is written to stderr in both cases.
 */
struct tnode *buildstoptree(char *fname, struct tnode *p) {
    FILE *fp = NULL;
    char line[MAXLINE];
    size_t len = 0;
    int lcount = 0;

    fp = fopen(fname, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error - fopen(%s)\n", fname);
        return NULL;
    }

    while (fgets(line, MAXLINE, fp) != NULL) {
        len = strlen(line);
        /*
         * Strip the line ending before checking the length, so that a
         * short word on a final line without newline is not dropped and
         * a CRLF file does not leave '\r' glued to the word.
         */
        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
            line[--len] = '\0';
        /* STMINLEN counts the newline, hence the + 1 */
        if (len + 1 < STMINLEN)
            continue;
        lcount++;
        p = addtree(p, line);
    }

    /* close before the lcount check so the stream is released on every path */
    fclose(fp);

    if (lcount == 0) {
        fprintf(stderr, "Error - Zero stopwords..\n");
        return NULL;
    }
    return p;
}
/*
 * Split string into tokens separated by any of the characters in delim.
 *
 * Returns a newly allocated array of MAXTOKENS pointers holding at most
 * MAXTOKENS - 1 newly allocated token strings followed by NULL; tokens
 * beyond that limit are ignored. The caller frees every token and then
 * the array itself.
 *
 * Returns NULL when string or delim is NULL or when memory runs out; in
 * that case everything allocated so far has already been released.
 * The input string is not modified.
 */
char **split(char *string, char *delim) {
    char **tokens = NULL;
    const char *cursor = NULL;
    size_t len = 0;
    int idx = 0;

    if (string == NULL || delim == NULL)
        return NULL;

    tokens = malloc(sizeof *tokens * MAXTOKENS);
    if (tokens == NULL)
        return NULL;
    for (idx = 0; idx < MAXTOKENS; idx++)
        tokens[idx] = NULL;

    /* skip leading delimiters; cursor then sits on a token or on '\0' */
    cursor = string + strspn(string, delim);
    idx = 0;
    /* always keep the last entry NULL terminated */
    while (idx < MAXTOKENS - 1 && *cursor != '\0') {
        len = strcspn(cursor, delim);
        tokens[idx] = malloc(len + 1);
        if (tokens[idx] == NULL) {
            /* give up cleanly instead of retrying the allocation forever */
            while (idx > 0)
                free(tokens[--idx]);
            free(tokens);
            return NULL;
        }
        memcpy(tokens[idx], cursor, len);
        tokens[idx][len] = '\0';
        idx++;
        /* step over the token and the delimiters that follow it */
        cursor += len;
        cursor += strspn(cursor, delim);
    }
    return tokens;
}
/*
 * Install word w in the binary tree rooted at p.
 *
 * A word that is already present only has its count incremented. The word
 * is copied into the new node, so the caller keeps ownership of w.
 *
 * Returns the root of the tree (the new node when p is NULL). When w is
 * NULL or memory for a new node cannot be allocated, the tree is returned
 * unchanged.
 */
struct tnode *addtree(struct tnode *p, char *w) {
    struct tnode **link = &p;
    struct tnode *node = NULL;
    size_t size = 0;
    int cond = 0;

    if (w == NULL)
        return p;

    /*
     * Descend with a loop rather than recursion: words added in sorted
     * order produce a tree as deep as the number of words.
     */
    while (*link != NULL) {
        cond = strcmp(w, (*link)->word);
        if (cond == 0) {
            (*link)->count++;
            return p;
        }
        link = (cond < 0) ? &(*link)->left : &(*link)->right;
    }

    node = talloc();
    if (node == NULL)
        return p;

    /* copy the word with malloc/memcpy; strdup is not part of ISO C11 */
    size = strlen(w) + 1;
    node->word = malloc(size);
    if (node->word == NULL) {
        free(node);
        return p;
    }
    memcpy(node->word, w, size);
    node->count = 1;
    node->left = node->right = NULL;

    /* hook the node into the empty slot found above (p itself if the tree was empty) */
    *link = node;
    return p;
}
/* allocate storage for one tnode; the result is whatever malloc returns */
struct tnode *talloc(void) {
    struct tnode *node = malloc(sizeof(struct tnode));

    return node;
}
/*
 * Look up w in the binary tree rooted at p.
 * Returns the node whose word equals w, or NULL when no such node exists.
 */
struct tnode *findstopword(struct tnode *p, char *w) {
    struct tnode *node = p;
    int order = 0;

    while (node != NULL) {
        order = strcmp(node->word, w);
        if (order == 0)
            break;
        /* a stored word greater than w means w can only be further left */
        node = (order > 0) ? node->left : node->right;
    }
    return node;
}
/*
 * Free the binary tree rooted at p: every node and the word it holds.
 * Passing NULL is allowed and does nothing.
 *
 * The tree is taken apart without recursion so that a very deep tree
 * cannot exhaust the stack: while the current node has a left child the
 * tree is rotated right around it; a node without a left child is freed
 * and the walk continues with its right child.
 */
void freetree(struct tnode *p) {
    struct tnode *next = NULL;

    while (p != NULL) {
        if (p->left != NULL) {
            /* rotate right: the left child becomes the current root */
            next = p->left;
            p->left = next->right;
            next->right = p;
        } else {
            next = p->right;
            free(p->word);
            free(p);
        }
        p = next;
    }
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAXTOKENS 256
#define MAXLINE 1024
#define MINLEN 3
#define STMINLEN 2
/* node of the binary search tree that holds the stop words */
struct tnode {
    char *word;          /* the stored word; heap copy released together with the node */
    int count;           /* how many times this word was added */
    struct tnode *left;  /* words that compare lower than this one */
    struct tnode *right; /* words that compare higher than this one */
};
struct tnode *buildstoptree(char *, struct tnode *);
struct tnode *addtree(struct tnode *, char *);
struct tnode *findstopword(struct tnode *, char *);
struct tnode *talloc(void);
void freetree(struct tnode *);
char **split(char *, char *);
/*
 * Read text from stdin, split every line into tokens and echo each token
 * that is not found in the stop word list named by argv[1]. The surviving
 * tokens of one input line are printed on one output line, each followed
 * by a space.
 *
 * Returns 0 on success and 1 on a usage error, when no stop word tree
 * could be built, or when tokenising a line runs out of memory.
 */
int main(int argc, char *argv[]) {
    /* delim does not include \' [\047] quote */
    char *delim = ".,:;`\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char **tokens = NULL;
    struct tnode *root = NULL;
    char line[MAXLINE];
    int i = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: tokstop STOPLIST.txt\n");
        return 1;
    }

    root = buildstoptree(argv[1], root);
    if (root == NULL)
        return 1;

    while (fgets(line, MAXLINE, stdin) != NULL) {
        if (strlen(line) < MINLEN)
            continue;

        tokens = split(line, delim);
        if (tokens == NULL) {
            /* split() reports allocation failure with NULL */
            fprintf(stderr, "Error - out of memory\n");
            freetree(root);
            return 1;
        }

        for (i = 0; tokens[i] != NULL; i++) {
            if (findstopword(root, tokens[i]) == NULL)
                printf("%s ", tokens[i]);
        }

        /* release every token string and then the array that held them */
        for (i = 0; tokens[i] != NULL; i++)
            free(tokens[i]);
        free(tokens);
        tokens = NULL;

        printf("\n");
    }

    freetree(root);
    return 0;
}
/*
 * Read a stop word list into a binary tree; expects one entry per line.
 *
 * fname - path of the stop word file
 * p     - root of the tree to add to (NULL starts a new tree)
 *
 * Trailing '\n' and '\r' characters are removed from every line before it
 * is stored, and lines that are empty afterwards are skipped.
 *
 * Returns the root of the tree, or NULL when the file cannot be opened or
 * contains no stop words; a message is written to stderr in both cases.
 */
struct tnode *buildstoptree(char *fname, struct tnode *p) {
    FILE *fp = NULL;
    char line[MAXLINE];
    size_t len = 0;
    int lcount = 0;

    fp = fopen(fname, "r");
    if (fp == NULL) {
        fprintf(stderr, "Error - fopen(%s)\n", fname);
        return NULL;
    }

    while (fgets(line, MAXLINE, fp) != NULL) {
        len = strlen(line);
        /*
         * Strip the line ending before checking the length, so that a
         * short word on a final line without newline is not dropped and
         * a CRLF file does not leave '\r' glued to the word.
         */
        while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r'))
            line[--len] = '\0';
        /* STMINLEN counts the newline, hence the + 1 */
        if (len + 1 < STMINLEN)
            continue;
        lcount++;
        p = addtree(p, line);
    }

    /* close before the lcount check so the stream is released on every path */
    fclose(fp);

    if (lcount == 0) {
        fprintf(stderr, "Error - Zero stopwords..\n");
        return NULL;
    }
    return p;
}
/*
 * Split string into tokens separated by any of the characters in delim.
 *
 * Returns a newly allocated array of MAXTOKENS pointers holding at most
 * MAXTOKENS - 1 newly allocated token strings followed by NULL; tokens
 * beyond that limit are ignored. The caller frees every token and then
 * the array itself.
 *
 * Returns NULL when string or delim is NULL or when memory runs out; in
 * that case everything allocated so far has already been released.
 * The input string is not modified.
 */
char **split(char *string, char *delim) {
    char **tokens = NULL;
    const char *cursor = NULL;
    size_t len = 0;
    int idx = 0;

    if (string == NULL || delim == NULL)
        return NULL;

    tokens = malloc(sizeof *tokens * MAXTOKENS);
    if (tokens == NULL)
        return NULL;
    for (idx = 0; idx < MAXTOKENS; idx++)
        tokens[idx] = NULL;

    /* skip leading delimiters; cursor then sits on a token or on '\0' */
    cursor = string + strspn(string, delim);
    idx = 0;
    /* always keep the last entry NULL terminated */
    while (idx < MAXTOKENS - 1 && *cursor != '\0') {
        len = strcspn(cursor, delim);
        tokens[idx] = malloc(len + 1);
        if (tokens[idx] == NULL) {
            /* give up cleanly instead of retrying the allocation forever */
            while (idx > 0)
                free(tokens[--idx]);
            free(tokens);
            return NULL;
        }
        memcpy(tokens[idx], cursor, len);
        tokens[idx][len] = '\0';
        idx++;
        /* step over the token and the delimiters that follow it */
        cursor += len;
        cursor += strspn(cursor, delim);
    }
    return tokens;
}
/*
 * Install word w in the binary tree rooted at p.
 *
 * A word that is already present only has its count incremented. The word
 * is copied into the new node, so the caller keeps ownership of w.
 *
 * Returns the root of the tree (the new node when p is NULL). When w is
 * NULL or memory for a new node cannot be allocated, the tree is returned
 * unchanged.
 */
struct tnode *addtree(struct tnode *p, char *w) {
    struct tnode **link = &p;
    struct tnode *node = NULL;
    size_t size = 0;
    int cond = 0;

    if (w == NULL)
        return p;

    /*
     * Descend with a loop rather than recursion: words added in sorted
     * order produce a tree as deep as the number of words.
     */
    while (*link != NULL) {
        cond = strcmp(w, (*link)->word);
        if (cond == 0) {
            (*link)->count++;
            return p;
        }
        link = (cond < 0) ? &(*link)->left : &(*link)->right;
    }

    node = talloc();
    if (node == NULL)
        return p;

    /* copy the word with malloc/memcpy; strdup is not part of ISO C11 */
    size = strlen(w) + 1;
    node->word = malloc(size);
    if (node->word == NULL) {
        free(node);
        return p;
    }
    memcpy(node->word, w, size);
    node->count = 1;
    node->left = node->right = NULL;

    /* hook the node into the empty slot found above (p itself if the tree was empty) */
    *link = node;
    return p;
}
/* allocate storage for one tnode; the result is whatever malloc returns */
struct tnode *talloc(void) {
    struct tnode *node = malloc(sizeof(struct tnode));

    return node;
}
/*
 * Look up w in the binary tree rooted at p.
 * Returns the node whose word equals w, or NULL when no such node exists.
 */
struct tnode *findstopword(struct tnode *p, char *w) {
    struct tnode *node = p;
    int order = 0;

    while (node != NULL) {
        order = strcmp(node->word, w);
        if (order == 0)
            break;
        /* a stored word greater than w means w can only be further left */
        node = (order > 0) ? node->left : node->right;
    }
    return node;
}
/*
 * Free the binary tree rooted at p: every node and the word it holds.
 * Passing NULL is allowed and does nothing.
 *
 * The tree is taken apart without recursion so that a very deep tree
 * cannot exhaust the stack: while the current node has a left child the
 * tree is rotated right around it; a node without a left child is freed
 * and the walk continues with its right child.
 */
void freetree(struct tnode *p) {
    struct tnode *next = NULL;

    while (p != NULL) {
        if (p->left != NULL) {
            /* rotate right: the left child becomes the current root */
            next = p->left;
            p->left = next->right;
            next->right = p;
        } else {
            next = p->right;
            free(p->word);
            free(p);
        }
        p = next;
    }
}
No comments:
Post a Comment