Monday, April 2, 2012

Compute string distance

Compute string distance

#include <stdio.h>
#include <string.h>
#include <getopt.h>
#include <stdlib.h>
#include <unistd.h>

#define VERSION "0.0.1"
#define PACKAGE "levenshtein"

void print_help(int exval);
int levenshtein(char *stra, char *strb);

int main(int argc, char *argv[]) {
 char *src_str  = NULL;
 char *trgt_str = NULL;
 int opt = 0;
 int output_flag_opt = 0;

 if(argc == 1)
  print_help(0);

 while((opt = getopt(argc, argv, "hve")) != -1) {
  switch(opt) {
   case 'h':
    print_help(0);
   case 'v':
    return 0;
   case 'e':
    output_flag_opt = 1;
    break;
   case '?':
    fprintf(stderr, "%s: Error - No such option `%c'\n", PACKAGE, optopt);
    print_help(1);
  } /* switch */
 } /* while */

 if((argc - optind) != 2) {
  fprintf(stderr, "%s: Error - Need `SOURCE' and `TARGET' string\n\n", PACKAGE);
  print_help(1);
 }

 if(strlen(argv[optind]) < 2 || strlen(argv[optind]) > 255) {
  fprintf(stderr, "%s: Error - strlen(SOURCE) is either `< 2' or `> 255'\n", PACKAGE);
  return 1;
 } else
  src_str = strdup(argv[optind]);

 optind++;

 if(strlen(argv[optind]) < 2 || strlen(argv[optind]) > 255) {
  fprintf(stderr, "%s: Error - strlen(TARGET) is either `< 2' or `> 255'\n", PACKAGE);
  return 1;
 } else
  trgt_str = strdup(argv[optind]);

 if(output_flag_opt == 0)
  printf("%d\n", levenshtein(src_str, trgt_str));
 else {
  printf("Source   : %s\n", src_str);
  printf("Target   : %s\n", trgt_str);
  printf("Lev.dist : %d\n", levenshtein(src_str, trgt_str));
 }

 return 0;
}

int levenshtein(char *stra, char *strb) {
 int i, j, k, l, m1, m2, m3, laa, r;
 int lengtha, lengthb, lengthmin;
 int **d;
 char *a;

 lengtha = strlen(stra);
 lengthb = strlen(strb);
 lengthmin = lengtha;
 if(lengthb > lengthmin) lengthmin = lengthb;

 a = (char*)malloc((lengthmin+2)*sizeof(char));
 d = (int**)malloc((lengthmin+2)*sizeof(int*));
 for(i = 0; i < lengthmin + 2; i++) d[i] = (int*)malloc((lengthmin+2)*sizeof(int));
 for(i = 0; i <= lengtha; i++) a[i] = stra[i];

 laa = lengtha;
 d[0][0] = 0;
 for(i = 1; i <= lengtha; i++) d[i][0] = d[i-1][0]+1;
 for(j = 1; j <= lengthb; j++) d[0][j] = d[0][j-1]+1;

 for(i = 1; i <= laa; i++) {
  for(j = 1;j <= lengthb; j++) {
   r = 0;
   if(a[i-1] != strb[j-1]) {
    r = 1;
    a[i-1] = strb[j-1];
   }
   m1 = d[i-1][j-1]+r;
   for(k = lengtha; k >= i-1; k--) a[k+1] = a[k];
   a[j-1] = strb[j-1];
   lengtha++;
   m2 = d[i][j-1]+1;
   for(k = i-1; k < lengtha; k++) a[k] = a[k+1];
   lengtha--;
   m3 = d[i-1][j]+1;
   r = m1;
   if( m2 < r) r = m2;
   if(m3 < r) r = m3;
   d[i][j] = r;
   lengtha = laa;
   for(l = 0; l <= lengtha; l++) a[l] = stra[l];
  }
 }

 r = d[lengtha][lengthb];
 free(a);
 for(i = 0; i < lengthmin+2; i++) free(d[i]);
 free(d);
 return(r);
}

void print_help(int exval) {
 printf("%s,%s is a measure of the similarity inbetween two strings\n",
   PACKAGE, VERSION);
 printf("Usage: %s [-h] [-v] [-e] SOURCE TARGET\n\n", PACKAGE);

 printf(" -h    print this help and exit\n");
 printf(" -v    print version and exit\n");
 printf(" -e    print the source, target string and levenshtein distance\n\n");

 exit(exval);
}

No comments:

Post a Comment