Calculate a similarity score from the intersecting n-grams of two words
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char **mkcgram(char *, int);
char *mkpadgr(char *, char *, int);
char *strndup(const char *, size_t);
int strnlen(const char *, int);
/*
 * Usage: ngramisect INT WORD1 WORD2
 *
 * Builds the padded character n-grams (length INT) of WORD1 and WORD2 with
 * mkcgram(), prints both lists, and then prints:
 *   - the total number of n-grams of both words,
 *   - how many n-grams the two words share,
 *   - total minus shared,
 *   - the score shared / (total - shared).
 *
 * Every n-gram of WORD2 is paired with at most one n-gram of WORD1, so the
 * shared count never exceeds the shorter list and identical words score 1.00.
 *
 * Returns 0 on success, 1 on a usage error or when memory runs out.
 */
int main(int argc, char *argv[]) {
    char **cgram1 = NULL;
    char **cgram2 = NULL;
    unsigned char *used = NULL; /* used[j] != 0 once cgram2[j] has been paired */
    size_t count1 = 0, count2 = 0;
    size_t total = 0, dupcount = 0, uniq = 0;
    size_t i = 0, j = 0;
    char *end = NULL;
    long parsed = 0;
    int nglen = 0;
    int status = 1;

    if(argc != 4) {
        fprintf(stderr, "Usage: ngramisect INT WORD1 WORD2\n");
        return 1;
    }

    /* strtol instead of atoi: reject garbage, overflow and lengths below 1 */
    errno = 0;
    parsed = strtol(argv[1], &end, 10);
    if(end == argv[1] || *end != '\0' || errno == ERANGE
       || parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr, "ngramisect: INT must be a positive integer\n");
        return 1;
    }
    nglen = (int)parsed;

    /* mkcgram() produces strlen(word) + 1 n-grams per word */
    count1 = strlen(argv[2]) + 1;
    count2 = strlen(argv[3]) + 1;
    total = count1 + count2;

    /* get ngrams for first word */
    cgram1 = mkcgram(argv[2], nglen);
    if(cgram1 == NULL)
        goto cleanup;
    for(i = 0; i < count1; i++) {
        if(cgram1[i] == NULL)
            goto cleanup;
        printf("cgram1[%zu] = %s\n", i, cgram1[i]);
    }
    printf("---\n");

    /* get ngrams for second word */
    cgram2 = mkcgram(argv[3], nglen);
    if(cgram2 == NULL)
        goto cleanup;
    for(i = 0; i < count2; i++) {
        if(cgram2[i] == NULL)
            goto cleanup;
        printf("cgram2[%zu] = %s\n", i, cgram2[i]);
    }

    /* compare the two arrays; each cgram2 entry may be matched only once */
    used = calloc(count2, sizeof *used);
    if(used == NULL)
        goto cleanup;
    for(i = 0; i < count1; i++) {
        for(j = 0; j < count2; j++) {
            if(!used[j] && strcmp(cgram1[i], cgram2[j]) == 0) {
                used[j] = 1;
                dupcount++;
                break;
            }
        }
    }

    /* dupcount <= count2 < total, so uniq is always at least 1 */
    uniq = total - dupcount;

    /* calc. score */
    printf("---\n");
    printf("total ngrams : %zu\n", total);
    printf("duplicates : %zu\n", dupcount);
    printf("uniq ngrams : %zu\n", uniq);
    printf("score : %0.2f\n", (double)dupcount / (double)uniq);
    status = 0;

cleanup:
    /* every failure that reaches this point is an allocation failure */
    if(status != 0)
        fprintf(stderr, "ngramisect: out of memory\n");
    free(used);
    if(cgram1 != NULL)
        for(i = 0; i < count1; i++)
            free(cgram1[i]);
    free(cgram1);
    if(cgram2 != NULL)
        for(i = 0; i < count2; i++)
            free(cgram2[i]);
    free(cgram2);
    return status;
}
/*
 * Build the array of character n-grams of str.
 *
 * str is padded by mkpadgr() using "_", then strndup() copies up to N
 * characters starting at every offset 0 .. strlen(str) of the padded text.
 * The result therefore has strlen(str) + 1 entries; the slot after the last
 * entry is set to NULL. N is expected to be at least 1.
 *
 * Returns a malloc'ed array of malloc'ed strings (the caller frees every
 * entry and then the array), or NULL if any allocation fails.
 */
char **mkcgram(char *str, int N) {
    char **retval = NULL;
    char *padded = NULL;
    size_t count = strlen(str) + 1;
    size_t i = 0;

    padded = mkpadgr(str, "_", N);
    if(padded == NULL)
        return NULL;

    /* one slot more than count so the array can be NULL-terminated */
    retval = malloc((count + 1) * sizeof *retval);
    if(retval == NULL) {
        free(padded);
        return NULL;
    }

    for(i = 0; i < count; i++) {
        retval[i] = strndup(&padded[i], N);
        if(retval[i] == NULL) {
            /* undo the entries built so far; do not hand back a partial array */
            while(i > 0)
                free(retval[--i]);
            free(retval);
            free(padded);
            return NULL;
        }
    }
    retval[count] = NULL;

    free(padded);
    return retval;
}
/*
 * Pad a word for n-gram extraction: one copy of padd in front of str and
 * (N - 1) copies of padd behind it. N < 1 is treated like N == 1, i.e. only
 * the leading pad is added.
 *
 * The buffer is sized from the real length of padd, so pads longer than one
 * character are safe.
 *
 * Returns a calloc'ed, NUL-terminated string that the caller must free, or
 * NULL if the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
    size_t slen = strlen(str);
    size_t plen = strlen(padd);
    size_t trailing = (N > 1) ? (size_t)(N - 1) : 0;
    char *buff = NULL;
    char *out = NULL;
    size_t i = 0;

    /* room for the leading pad, the word, the trailing pads and the NUL */
    buff = calloc(plen * (trailing + 1) + slen + 1, sizeof(char));
    if(buff == NULL)
        return NULL;

    /* append through a moving cursor instead of rescanning with strcat */
    out = buff;
    memcpy(out, padd, plen);
    out += plen;
    memcpy(out, str, slen);
    out += slen;
    for(i = 0; i < trailing; i++) {
        memcpy(out, padd, plen);
        out += plen;
    }

    /* calloc zero-filled the buffer, so the terminator is already in place */
    return buff;
}
/*
 * Copy at most n leading characters of str into a freshly malloc'ed,
 * NUL-terminated buffer. The length to copy comes from strnlen().
 * Returns NULL when malloc fails; otherwise the caller frees the result.
 */
char *strndup(const char *str, size_t n) {
    size_t count = strnlen(str, n);
    char *copy = malloc(count + 1);

    if(copy != NULL) {
        memcpy(copy, str, count);
        copy[count] = '\0';
    }
    return copy;
}
/*
 * Length of str, looking at no more than max characters: the offset of the
 * first NUL found by memchr() within that range, or max when there is none.
 */
int strnlen(const char *str, int max) {
    const char *nul = memchr(str, '\0', max);

    if(nul == NULL)
        return max;
    return (int)(nul - str);
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char **mkcgram(char *, int);
char *mkpadgr(char *, char *, int);
char *strndup(const char *, size_t);
int strnlen(const char *, int);
/*
 * Usage: ngramisect INT WORD1 WORD2
 *
 * Builds the padded character n-grams (length INT) of WORD1 and WORD2 with
 * mkcgram(), prints both lists, and then prints:
 *   - the total number of n-grams of both words,
 *   - how many n-grams the two words share,
 *   - total minus shared,
 *   - the score shared / (total - shared).
 *
 * Every n-gram of WORD2 is paired with at most one n-gram of WORD1, so the
 * shared count never exceeds the shorter list and identical words score 1.00.
 *
 * Returns 0 on success, 1 on a usage error or when memory runs out.
 */
int main(int argc, char *argv[]) {
    char **cgram1 = NULL;
    char **cgram2 = NULL;
    unsigned char *used = NULL; /* used[j] != 0 once cgram2[j] has been paired */
    size_t count1 = 0, count2 = 0;
    size_t total = 0, dupcount = 0, uniq = 0;
    size_t i = 0, j = 0;
    char *end = NULL;
    long parsed = 0;
    int nglen = 0;
    int status = 1;

    if(argc != 4) {
        fprintf(stderr, "Usage: ngramisect INT WORD1 WORD2\n");
        return 1;
    }

    /* strtol instead of atoi: reject garbage, overflow and lengths below 1 */
    errno = 0;
    parsed = strtol(argv[1], &end, 10);
    if(end == argv[1] || *end != '\0' || errno == ERANGE
       || parsed < 1 || parsed > INT_MAX) {
        fprintf(stderr, "ngramisect: INT must be a positive integer\n");
        return 1;
    }
    nglen = (int)parsed;

    /* mkcgram() produces strlen(word) + 1 n-grams per word */
    count1 = strlen(argv[2]) + 1;
    count2 = strlen(argv[3]) + 1;
    total = count1 + count2;

    /* get ngrams for first word */
    cgram1 = mkcgram(argv[2], nglen);
    if(cgram1 == NULL)
        goto cleanup;
    for(i = 0; i < count1; i++) {
        if(cgram1[i] == NULL)
            goto cleanup;
        printf("cgram1[%zu] = %s\n", i, cgram1[i]);
    }
    printf("---\n");

    /* get ngrams for second word */
    cgram2 = mkcgram(argv[3], nglen);
    if(cgram2 == NULL)
        goto cleanup;
    for(i = 0; i < count2; i++) {
        if(cgram2[i] == NULL)
            goto cleanup;
        printf("cgram2[%zu] = %s\n", i, cgram2[i]);
    }

    /* compare the two arrays; each cgram2 entry may be matched only once */
    used = calloc(count2, sizeof *used);
    if(used == NULL)
        goto cleanup;
    for(i = 0; i < count1; i++) {
        for(j = 0; j < count2; j++) {
            if(!used[j] && strcmp(cgram1[i], cgram2[j]) == 0) {
                used[j] = 1;
                dupcount++;
                break;
            }
        }
    }

    /* dupcount <= count2 < total, so uniq is always at least 1 */
    uniq = total - dupcount;

    /* calc. score */
    printf("---\n");
    printf("total ngrams : %zu\n", total);
    printf("duplicates : %zu\n", dupcount);
    printf("uniq ngrams : %zu\n", uniq);
    printf("score : %0.2f\n", (double)dupcount / (double)uniq);
    status = 0;

cleanup:
    /* every failure that reaches this point is an allocation failure */
    if(status != 0)
        fprintf(stderr, "ngramisect: out of memory\n");
    free(used);
    if(cgram1 != NULL)
        for(i = 0; i < count1; i++)
            free(cgram1[i]);
    free(cgram1);
    if(cgram2 != NULL)
        for(i = 0; i < count2; i++)
            free(cgram2[i]);
    free(cgram2);
    return status;
}
/*
 * Build the array of character n-grams of str.
 *
 * str is padded by mkpadgr() using "_", then strndup() copies up to N
 * characters starting at every offset 0 .. strlen(str) of the padded text.
 * The result therefore has strlen(str) + 1 entries; the slot after the last
 * entry is set to NULL. N is expected to be at least 1.
 *
 * Returns a malloc'ed array of malloc'ed strings (the caller frees every
 * entry and then the array), or NULL if any allocation fails.
 */
char **mkcgram(char *str, int N) {
    char **retval = NULL;
    char *padded = NULL;
    size_t count = strlen(str) + 1;
    size_t i = 0;

    padded = mkpadgr(str, "_", N);
    if(padded == NULL)
        return NULL;

    /* one slot more than count so the array can be NULL-terminated */
    retval = malloc((count + 1) * sizeof *retval);
    if(retval == NULL) {
        free(padded);
        return NULL;
    }

    for(i = 0; i < count; i++) {
        retval[i] = strndup(&padded[i], N);
        if(retval[i] == NULL) {
            /* undo the entries built so far; do not hand back a partial array */
            while(i > 0)
                free(retval[--i]);
            free(retval);
            free(padded);
            return NULL;
        }
    }
    retval[count] = NULL;

    free(padded);
    return retval;
}
/*
 * Pad a word for n-gram extraction: one copy of padd in front of str and
 * (N - 1) copies of padd behind it. N < 1 is treated like N == 1, i.e. only
 * the leading pad is added.
 *
 * The buffer is sized from the real length of padd, so pads longer than one
 * character are safe.
 *
 * Returns a calloc'ed, NUL-terminated string that the caller must free, or
 * NULL if the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
    size_t slen = strlen(str);
    size_t plen = strlen(padd);
    size_t trailing = (N > 1) ? (size_t)(N - 1) : 0;
    char *buff = NULL;
    char *out = NULL;
    size_t i = 0;

    /* room for the leading pad, the word, the trailing pads and the NUL */
    buff = calloc(plen * (trailing + 1) + slen + 1, sizeof(char));
    if(buff == NULL)
        return NULL;

    /* append through a moving cursor instead of rescanning with strcat */
    out = buff;
    memcpy(out, padd, plen);
    out += plen;
    memcpy(out, str, slen);
    out += slen;
    for(i = 0; i < trailing; i++) {
        memcpy(out, padd, plen);
        out += plen;
    }

    /* calloc zero-filled the buffer, so the terminator is already in place */
    return buff;
}
/*
 * Copy at most n leading characters of str into a freshly malloc'ed,
 * NUL-terminated buffer. The length to copy comes from strnlen().
 * Returns NULL when malloc fails; otherwise the caller frees the result.
 */
char *strndup(const char *str, size_t n) {
    size_t count = strnlen(str, n);
    char *copy = malloc(count + 1);

    if(copy != NULL) {
        memcpy(copy, str, count);
        copy[count] = '\0';
    }
    return copy;
}
/*
 * Length of str, looking at no more than max characters: the offset of the
 * first NUL found by memchr() within that range, or max when there is none.
 */
int strnlen(const char *str, int max) {
    const char *nul = memchr(str, '\0', max);

    if(nul == NULL)
        return max;
    return (int)(nul - str);
}
No comments:
Post a Comment