Monday, April 2, 2012

Extract character n-grams from text data

Extract character n-grams from text data

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* size of the input line buffer handed to fgets() */
#define MAXLINE         1024
/* input lines whose strlen() (newline included) is below this are skipped */
#define MINLEN          3

/* print every N-character gram of the padded token `str', one per line */
void printgrams(char *, int);
/* pad a token: one pad string in front, (N - 1) pad strings behind;
 * returns a heap buffer that the caller must free() */
char *mkpadgr(char *, char *, int);

/*
 * Read text from stdin, split every line into tokens (punctuation,
 * whitespace and digits act as separators) and print all character
 * n-grams of each token on stdout, one per line.
 *
 * Usage: chargram INT
 *   INT is the n-gram length and must be a decimal integer >= 1.
 *
 * Returns 0 on success and 1 on a usage error.
 */
int main(int argc, char *argv[]) {
 const char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
 char *token = NULL;
 char *end = NULL;
 char line[MAXLINE];
 long parsed = 0;
 int nglen = 0;

 if(argc != 2) {
  fprintf(stderr, "Usage: chargram INT\n");
  return 1;
 }

 /* strtol instead of atoi: atoi cannot report garbage or overflow, and a
  * length below 1 would make the padding buffer too small later on */
 errno = 0;
 parsed = strtol(argv[1], &end, 10);
 if(end == argv[1] || *end != '\0' || errno == ERANGE
   || parsed < 1 || parsed > INT_MAX) {
  fprintf(stderr, "chargram: invalid n-gram length `%s'\n", argv[1]);
  fprintf(stderr, "Usage: chargram INT\n");
  return 1;
 }
 nglen = (int)parsed;

 /* NOTE: a line longer than MAXLINE - 1 bytes is delivered by fgets in
  * several pieces, so a token straddling a piece boundary is split */
 while(fgets(line, sizeof line, stdin) != NULL) {
  /* skip lines too short to carry a useful token */
  if(strlen(line) < MINLEN)
   continue;

  for(token = strtok(line, delim); token != NULL;
    token = strtok(NULL, delim)) {
   printgrams(token, nglen);
  }
 }

 return 0;
}

/*
 * Print all character n-grams of `str' on stdout, one per line.
 *
 * The token is first padded with "_" (one in front, N - 1 behind), then a
 * window of N characters is slid over it, giving strlen(str) + 1 grams.
 *
 *   str  token to process; nothing is printed when it is NULL
 *   N    n-gram length; nothing is printed when it is below 1
 */
void printgrams(char *str, int N) {
 char *padded = NULL;
 size_t count = 0;
 size_t i = 0;

 /* a gram needs at least one character */
 if(str == NULL || N < 1) {
  return;
 }

 padded = mkpadgr(str, "_", N);
 /* guard against a failed allocation instead of dereferencing NULL */
 if(padded == NULL) {
  fprintf(stderr, "printgrams: out of memory\n");
  return;
 }

 /* hoisted: the token length does not change inside the loop */
 count = strlen(str) + 1;
 for(i = 0; i < count; i++) {
  /* the padded string always has N characters left at offset i */
  printf("%.*s\n", N, padded + i);
 }

 free(padded);
}

/*
 * Build a padded copy of `str': one copy of `padd' in front of it and
 * (N - 1) copies of `padd' behind it.
 *
 *   str   token to pad
 *   padd  pad string; may be longer than one character
 *   N     n-gram length; values below 1 are treated as 1 (no trailing pad)
 *
 * Returns a NUL-terminated heap buffer that the caller must free(), or
 * NULL when an argument is NULL, the size would overflow, or the
 * allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
 size_t slen = 0, plen = 0, ntail = 0, total = 0, i = 0;
 char *buff = NULL;
 char *out = NULL;

 if(str == NULL || padd == NULL) {
  return NULL;
 }

 slen = strlen(str);
 plen = strlen(padd);
 ntail = (N > 1) ? (size_t)(N - 1) : 0;

 /* refuse sizes that would wrap around size_t */
 if(plen != 0 && ntail + 1 > ((size_t)-1 - slen - 1) / plen) {
  return NULL;
 }

 /* prefix pad + token + trailing pads + terminating NUL */
 total = plen * (ntail + 1) + slen + 1;
 buff = malloc(total);
 if(buff == NULL) {
  return NULL;
 }

 /* append with a running cursor; repeated strcat would rescan the
  * buffer from the start on every call */
 out = buff;
 memcpy(out, padd, plen);
 out += plen;
 memcpy(out, str, slen);
 out += slen;
 for(i = 0; i < ntail; i++) {
  memcpy(out, padd, plen);
  out += plen;
 }
 *out = '\0';

 return buff;
}

No comments:

Post a Comment