Extract character n-grams from text data
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Size, in bytes, of the buffer each input line is read into. */
#define MAXLINE 1024
/* Input lines shorter than this many characters are skipped. */
#define MINLEN 3

/* Print every n-gram of length N found in the token `str`. */
void printgrams(char *str, int N);

/*
 * Pad the token `str` with one leading copy of `padd` and (N - 1)
 * trailing copies; the result is heap-allocated.
 */
char *mkpadgr(char *str, char *padd, int N);
/*
 * chargram: read text from stdin, split every line into tokens and print
 * the character n-grams of each token.
 *
 * Usage: chargram INT
 *   INT is the n-gram length and must be a positive decimal integer.
 *
 * Returns 0 on success, 1 on a usage error or an invalid n-gram length.
 */
int main(int argc, char *argv[]) {
    /* Punctuation, whitespace and digits all separate tokens.  '_' is a
     * delimiter too, so a token never contains the "_" pad character. */
    char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char *token = NULL;
    char *end = NULL;
    char line[MAXLINE];
    long parsed = 0;
    int nglen = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: chargram INT\n");
        return 1;
    }

    /* strtol (unlike atoi) lets us reject non-numeric input, trailing
     * garbage, out-of-range values and lengths below 1. */
    errno = 0;
    parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr,
                "chargram: n-gram length must be a positive integer, got '%s'\n",
                argv[1]);
        return 1;
    }
    nglen = (int)parsed;

    while (fgets(line, sizeof line, stdin) != NULL) {
        /* fgets keeps the newline, so it counts toward the length. */
        if (strlen(line) < MINLEN)
            continue;

        for (token = strtok(line, delim);
             token != NULL;
             token = strtok(NULL, delim)) {
            printgrams(token, nglen);
        }
    }
    return 0;
}
/*
 * Print every character n-gram of length N for the token `str`, one per
 * line, on stdout.
 *
 * The token is first padded by mkpadgr() with "_" (one in front, N - 1
 * behind), so a token of length L yields L + 1 grams.
 *
 * Nothing is printed when `str` is NULL or N < 1.  If the padded copy
 * cannot be built, a message is written to stderr and the token is skipped.
 */
void printgrams(char *str, int N) {
    char *padded = NULL;
    size_t len = 0;
    size_t i = 0;

    if (str == NULL || N < 1)
        return;

    padded = mkpadgr(str, "_", N);
    if (padded == NULL) {
        fprintf(stderr, "chargram: out of memory\n");
        return;
    }

    /* Hoisted out of the loop condition; also avoids comparing a signed
     * index against the unsigned result of strlen(). */
    len = strlen(str);
    for (i = 0; i <= len; i++) {
        /* Window of N characters starting at offset i of the padded token. */
        fwrite(padded + i, sizeof(char), (size_t)N, stdout);
        putchar('\n');
    }

    free(padded);
}
/*
 * Build a padded copy of `str` for n-gram extraction: one copy of `padd`
 * in front, then `str`, then (N - 1) copies of `padd` behind.
 *
 * N < 1 is treated like N == 1 (leading pad only, no trailing pads).
 * `padd` may be any length, including empty.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if `str` or `padd` is NULL, the required size would
 * overflow size_t, or the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
    const size_t size_max = (size_t)-1;
    size_t slen = 0, plen = 0, ntrail = 0, total = 0, k = 0;
    char *buff = NULL;
    char *out = NULL;

    if (str == NULL || padd == NULL)
        return NULL;

    slen = strlen(str);
    plen = strlen(padd);
    ntrail = (N > 1) ? (size_t)(N - 1) : 0;

    /* total = slen + plen * (ntrail + 1) + 1; refuse if that would wrap. */
    if (plen != 0 && ntrail + 1 > (size_max - slen - 1) / plen)
        return NULL;
    total = slen + plen * (ntrail + 1) + 1;

    buff = malloc(total);
    if (buff == NULL)
        return NULL;

    /* Append with a moving cursor instead of repeated strcat(), which
     * would rescan the buffer from the start on every call. */
    out = buff;
    memcpy(out, padd, plen);
    out += plen;
    memcpy(out, str, slen);
    out += slen;
    for (k = 0; k < ntrail; k++) {
        memcpy(out, padd, plen);
        out += plen;
    }
    *out = '\0';

    return buff;
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Size, in bytes, of the buffer each input line is read into. */
#define MAXLINE 1024
/* Input lines shorter than this many characters are skipped. */
#define MINLEN 3

/* Print every n-gram of length N found in the token `str`. */
void printgrams(char *str, int N);

/*
 * Pad the token `str` with one leading copy of `padd` and (N - 1)
 * trailing copies; the result is heap-allocated.
 */
char *mkpadgr(char *str, char *padd, int N);
/*
 * chargram: read text from stdin, split every line into tokens and print
 * the character n-grams of each token.
 *
 * Usage: chargram INT
 *   INT is the n-gram length and must be a positive decimal integer.
 *
 * Returns 0 on success, 1 on a usage error or an invalid n-gram length.
 */
int main(int argc, char *argv[]) {
    /* Punctuation, whitespace and digits all separate tokens.  '_' is a
     * delimiter too, so a token never contains the "_" pad character. */
    char *delim = ".,:;`'\"+-_(){}[]<>*&^%$#@!?~/|\\= \t\r\n1234567890";
    char *token = NULL;
    char *end = NULL;
    char line[MAXLINE];
    long parsed = 0;
    int nglen = 0;

    if (argc != 2) {
        fprintf(stderr, "Usage: chargram INT\n");
        return 1;
    }

    /* strtol (unlike atoi) lets us reject non-numeric input, trailing
     * garbage, out-of-range values and lengths below 1. */
    errno = 0;
    parsed = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || errno == ERANGE ||
        parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr,
                "chargram: n-gram length must be a positive integer, got '%s'\n",
                argv[1]);
        return 1;
    }
    nglen = (int)parsed;

    while (fgets(line, sizeof line, stdin) != NULL) {
        /* fgets keeps the newline, so it counts toward the length. */
        if (strlen(line) < MINLEN)
            continue;

        for (token = strtok(line, delim);
             token != NULL;
             token = strtok(NULL, delim)) {
            printgrams(token, nglen);
        }
    }
    return 0;
}
/*
 * Print every character n-gram of length N for the token `str`, one per
 * line, on stdout.
 *
 * The token is first padded by mkpadgr() with "_" (one in front, N - 1
 * behind), so a token of length L yields L + 1 grams.
 *
 * Nothing is printed when `str` is NULL or N < 1.  If the padded copy
 * cannot be built, a message is written to stderr and the token is skipped.
 */
void printgrams(char *str, int N) {
    char *padded = NULL;
    size_t len = 0;
    size_t i = 0;

    if (str == NULL || N < 1)
        return;

    padded = mkpadgr(str, "_", N);
    if (padded == NULL) {
        fprintf(stderr, "chargram: out of memory\n");
        return;
    }

    /* Hoisted out of the loop condition; also avoids comparing a signed
     * index against the unsigned result of strlen(). */
    len = strlen(str);
    for (i = 0; i <= len; i++) {
        /* Window of N characters starting at offset i of the padded token. */
        fwrite(padded + i, sizeof(char), (size_t)N, stdout);
        putchar('\n');
    }

    free(padded);
}
/*
 * Build a padded copy of `str` for n-gram extraction: one copy of `padd`
 * in front, then `str`, then (N - 1) copies of `padd` behind.
 *
 * N < 1 is treated like N == 1 (leading pad only, no trailing pads).
 * `padd` may be any length, including empty.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if `str` or `padd` is NULL, the required size would
 * overflow size_t, or the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
    const size_t size_max = (size_t)-1;
    size_t slen = 0, plen = 0, ntrail = 0, total = 0, k = 0;
    char *buff = NULL;
    char *out = NULL;

    if (str == NULL || padd == NULL)
        return NULL;

    slen = strlen(str);
    plen = strlen(padd);
    ntrail = (N > 1) ? (size_t)(N - 1) : 0;

    /* total = slen + plen * (ntrail + 1) + 1; refuse if that would wrap. */
    if (plen != 0 && ntrail + 1 > (size_max - slen - 1) / plen)
        return NULL;
    total = slen + plen * (ntrail + 1) + 1;

    buff = malloc(total);
    if (buff == NULL)
        return NULL;

    /* Append with a moving cursor instead of repeated strcat(), which
     * would rescan the buffer from the start on every call. */
    out = buff;
    memcpy(out, padd, plen);
    out += plen;
    memcpy(out, str, slen);
    out += slen;
    for (k = 0; k < ntrail; k++) {
        memcpy(out, padd, plen);
        out += plen;
    }
    *out = '\0';

    return buff;
}
No comments:
Post a Comment