Monday, April 2, 2012

Calc. score intersecting Programs

Calculating a similarity score from intersecting n-grams

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* build the array of ngrams for a word, given the ngram length */
char **mkcgram(char *, int);
/* pad a word: one pad string in front, (N - 1) pad strings behind */
char *mkpadgr(char *, char *, int);
/* string helpers implemented further down in this file */
char *strndup(const char *, size_t);
int strnlen(const char *, int);

/*
 * ngramisect: print the ngrams of two words and a score derived from the
 * number of ngrams they have in common.
 *
 * Usage: ngramisect INT WORD1 WORD2
 *   INT    ngram length, must be a positive integer
 *   WORD1  first word
 *   WORD2  second word
 *
 * Returns 0 on success, 1 on a usage error, an invalid INT or an
 * allocation failure.
 */
int main(int argc, char *argv[]) {
 char **cgram1 = NULL;
 char **cgram2 = NULL;
 char *end = NULL;
 size_t count1 = 0, count2 = 0, total = 0, uniq = 0;
 size_t i = 0, j = 0;
 long parsed = 0;
 int dupcount = 0, nglen = 0;
 int status = 1;

 if(argc != 4) {
  fprintf(stderr, "Usage: ngramisect INT WORD1 WORD2\n");
  return 1;
 }

 /* strtol (unlike atoi) lets us reject garbage and out-of-range values;
    lengths below 1 are refused because the padding code expects N >= 1 */
 errno = 0;
 parsed = strtol(argv[1], &end, 10);
 if(end == argv[1] || *end != '\0' || errno == ERANGE
    || parsed < 1 || parsed > INT_MAX) {
  fprintf(stderr, "ngramisect: INT must be a positive integer\n");
  return 1;
 }
 nglen = (int)parsed;

 /* mkcgram() fills strlen(word) + 1 entries; compute the counts once */
 count1 = strlen(argv[2]) + 1;
 count2 = strlen(argv[3]) + 1;
 total  = count1 + count2;

 /* get ngrams for first word */
 cgram1 = mkcgram(argv[2], nglen);
 if(cgram1 == NULL) {
  fprintf(stderr, "ngramisect: out of memory\n");
  goto done;
 }
 for(i = 0; i < count1; i++)
  printf("cgram1[%zu] = %s\n", i, cgram1[i]);

 printf("---\n");

 /* get ngrams for second word */
 cgram2 = mkcgram(argv[3], nglen);
 if(cgram2 == NULL) {
  fprintf(stderr, "ngramisect: out of memory\n");
  goto done;
 }
 for(i = 0; i < count2; i++)
  printf("cgram2[%zu] = %s\n", i, cgram2[i]);

 /* compare two arrays, count every matching (i, j) pair */
 for(i = 0; i < count1; i++)
  for(j = 0; j < count2; j++)
   if(strcmp(cgram1[i], cgram2[j]) == 0)
    dupcount++;

 /* calc. score
    NOTE(review): dupcount counts pairs, so words with repeated ngrams can
    push it above total; the unsigned subtraction below then wraps. */
 uniq = total - (size_t)dupcount;

 printf("---\n");
 printf("total ngrams : %zu\n", total);
 printf("duplicates   : %d\n", dupcount);
 printf("uniq ngrams  : %zu\n", uniq);
 printf("score        : %0.2f\n", (double)dupcount / uniq);

 status = 0;

done:
 /* clean up .. (free(NULL) is a no-op, only the element loops need a guard) */
 if(cgram1 != NULL)
  for(i = 0; i < count1; i++)
   free(cgram1[i]);
 free(cgram1);
 if(cgram2 != NULL)
  for(i = 0; i < count2; i++)
   free(cgram2[i]);
 free(cgram2);

 return status;
}

/*
 * Return a newly allocated array holding the ngrams of str.
 *
 * The word is padded by mkpadgr() with "_"; entry i is a copy of at most
 * N characters of the padded word starting at offset i. The array holds
 * strlen(str) + 1 ngrams followed by a terminating NULL pointer.
 *
 * The caller owns the result: free() every entry, then the array itself.
 * Returns NULL if any allocation fails (nothing is leaked in that case).
 */
char **mkcgram(char *str, int N) {
 char **retval = NULL;
 char *padded  = NULL;
 size_t count = strlen(str) + 1;
 size_t i = 0;

 padded = mkpadgr(str, "_", N);
 if(padded == NULL)
  return NULL;

 /* count ngrams plus one slot for the NULL terminator */
 retval = calloc(count + 1, sizeof *retval);
 if(retval == NULL) {
  free(padded);
  return NULL;
 }

 for(i = 0; i < count; i++) {
  retval[i] = strndup(&padded[i], N);
  if(retval[i] == NULL) {
   /* undo the partial work so the caller never sees a holey array */
   while(i > 0)
    free(retval[--i]);
   free(retval);
   free(padded);
   return NULL;
  }
 }
 retval[count] = NULL;

 free(padded);
 return retval;
}

/*
 * Pad a word: one copy of padd is placed in front of str and (N - 1)
 * copies are appended after it. For N < 1 no trailing padding is added.
 *
 * The buffer is sized from the actual lengths of str and padd, so pad
 * strings longer than one character are handled as well.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if the size would overflow or the allocation fails.
 */
char *mkpadgr(char *str, char *padd, int N) {
 const size_t max = (size_t)-1;
 size_t slen = strlen(str);
 size_t plen = strlen(padd);
 size_t tail = (N > 1) ? (size_t)(N - 1) : 0;
 char *buff = NULL;
 char *out = NULL;
 size_t i = 0;

 /* total = slen + (tail + 1) * plen + 1; refuse if that cannot fit */
 if(plen != 0 && tail + 1 > (max - slen - 1) / plen)
  return NULL;

 buff = malloc(slen + (tail + 1) * plen + 1);
 if(buff == NULL)
  return NULL;

 /* append with a moving cursor instead of rescanning via strcat */
 out = buff;
 memcpy(out, padd, plen);
 out += plen;
 memcpy(out, str, slen);
 out += slen;
 for(i = 0; i < tail; i++) {
  memcpy(out, padd, plen);
  out += plen;
 }
 *out = '\0';

 return buff;
}

/*
 * Duplicate at most n characters of str.
 *
 * Characters are copied up to the first NUL or until n of them have been
 * taken, whichever comes first; the copy is always NUL-terminated.
 *
 * Returns newly allocated storage that the caller must free(), or NULL
 * if malloc fails.
 */
char *strndup(const char *str, size_t n) {
 const char *end = NULL;
 char *retval = NULL;
 size_t len = 0;

 /* measure in size_t: passing n through an int-based length helper
    would truncate large values (e.g. 2^32 would become 0) */
 end = memchr(str, '\0', n);
 len = end ? (size_t)(end - str) : n;

 retval = malloc(len + 1);
 if(retval == NULL)
  return NULL;

 memcpy(retval, str, len);
 retval[len] = '\0';
 return retval;
}

/*
 * Length of str, looking at no more than max bytes: the offset of the
 * first NUL found by memchr(), or max when none is found in that range.
 */
int strnlen(const char *str, int max) {
 const char *nul = memchr(str, '\0', max);

 if(nul == NULL)
  return max;

 return nul - str;
}

No comments:

Post a Comment