? contrib/fuzzystrmatch/fuzzystrmatch.sql ? contrib/fuzzystrmatch/libfuzzystrmatch.so.0.0 Index: contrib/fuzzystrmatch/fuzzystrmatch.c =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v retrieving revision 1.26 diff -c -r1.26 fuzzystrmatch.c *** contrib/fuzzystrmatch/fuzzystrmatch.c 25 Mar 2008 22:42:41 -0000 1.26 --- contrib/fuzzystrmatch/fuzzystrmatch.c 3 Apr 2008 06:08:21 -0000 *************** *** 15,20 **** --- 15,22 ---- * found at http://www.merriampark.com/ld.htm * Also looked at levenshtein.c in the PHP 4.0.6 distribution for * inspiration. + * Configurable penalty costs extension is introduced by Volkan + * YAZICI . * * metaphone() * ----------- *************** *** 48,195 **** PG_MODULE_MAGIC; /* ! * Calculates Levenshtein Distance between two strings. ! * Uses simplest and fastest cost model only, i.e. assumes a cost of 1 for ! * each deletion, substitution, or insertion. */ ! PG_FUNCTION_INFO_V1(levenshtein); ! Datum ! levenshtein(PG_FUNCTION_ARGS) { ! char *str_s = TextDatumGetCString(PG_GETARG_DATUM(0)); ! char *str_t = TextDatumGetCString(PG_GETARG_DATUM(1)); ! int cols = strlen(str_s) + 1; ! int rows = strlen(str_t) + 1; ! char *str_s0; ! int *u_cells; ! int *l_cells; ! int *tmp; ! int i; ! int j; /* ! * str_s is referred to as the "source", str_t is referred to as the ! * "target", cols = length of source + 1 to allow for the initialization ! * column, rows = length of target + 1 to allow for the initialization row */ ! ! /* ! * Restrict the length of the strings being compared to something ! * reasonable because we will have to perform rows * cols calculations. If ! * longer strings need to be compared, increase MAX_LEVENSHTEIN_STRLEN to ! * suit (but within your tolerance for speed and memory usage). ! */ ! if ((cols > MAX_LEVENSHTEIN_STRLEN + 1) || (rows > MAX_LEVENSHTEIN_STRLEN + 1)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument exceeds the maximum length of %d bytes", MAX_LEVENSHTEIN_STRLEN))); ! /* ! * If either rows or cols is 0, the answer is the other value. This makes ! * sense since it would take that many insertions the build a matching ! * string ! */ ! ! if (cols == 0) ! PG_RETURN_INT32(rows); ! ! if (rows == 0) ! PG_RETURN_INT32(cols); /* ! * Allocate two vectors of integers. One will be used for the "upper" row, ! * the other for the "lower" row. Initialize the "upper" row to 0..cols */ ! u_cells = palloc(sizeof(int) * cols); ! for (i = 0; i < cols; i++) ! u_cells[i] = i; ! l_cells = palloc(sizeof(int) * cols); ! /* ! * Use str_s0 to "rewind" the pointer to str_s in the nested for loop ! * below ! */ ! str_s0 = str_s; ! ! /* ! * Loop through the rows, starting at row 1. Row 0 is used for the initial ! * "upper" row. ! */ ! for (j = 1; j < rows; j++) { /* ! * We'll always start with col 1, and initialize lower row col 0 to j */ ! l_cells[0] = j; ! ! for (i = 1; i < cols; i++) { ! int c = 0; ! int c1 = 0; ! int c2 = 0; ! int c3 = 0; ! ! /* ! * The "cost" value is 0 if the character at the current col ! * position in the source string, matches the character at the ! * current row position in the target string; cost is 1 otherwise. ! */ ! c = (*str_s != *str_t); ! /* ! * c1 is upper right cell plus 1 ! */ ! c1 = u_cells[i] + 1; ! /* ! * c2 is lower left cell plus 1 ! */ ! c2 = l_cells[i - 1] + 1; - /* - * c3 is cell diagonally above to the left plus "cost" - */ - c3 = u_cells[i - 1] + c; ! /* ! * The lower right cell is set to the minimum of c1, c2, c3 ! */ ! l_cells[i] = (c1 < c2 ? c1 : c2) < c3 ? (c1 < c2 ? c1 : c2) : c3; ! /* ! * Increment the pointer to str_s ! */ ! str_s++; ! } - /* - * Lower row now becomes the upper row, and the upper row gets reused - * as the new lower row. - */ - tmp = u_cells; - u_cells = l_cells; - l_cells = tmp; ! /* ! * Increment the pointer to str_t ! */ ! str_t++; ! /* ! * Rewind the pointer to str_s ! */ ! str_s = str_s0; ! } ! /* ! * Because the final value (at position row, col) was swapped from the ! * lower row to the upper row, that's where we'll find it. ! */ ! PG_RETURN_INT32(u_cells[cols - 1]); } /* * Calculates the metaphone of an input string. * Returns number of characters requested --- 50,178 ---- PG_MODULE_MAGIC; /* ! * levenshtein_internal - Calculates Levenshtein distance metric ! * between supplied strings. Generally ! * (1, 1, 1) penalty costs suffices common ! * cases, but your mileage may vary. */ ! static int ! levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c) { ! int m, n; ! int *prev; ! int *curr; ! int i, j; ! char *x; ! char *y; ! ! m = strlen(s); ! n = strlen(t); ! ! if (!m) ! return n; ! if (!n) ! return m; /* ! * For security concerns, restrict excessive CPU+RAM usage. (This ! * implementation uses O(m) memory and has O(mn) complexity.) */ ! if ((m + n) > (2 * MAX_LEVENSHTEIN_STRLEN)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("argument exceeds the maximum length of %d bytes", MAX_LEVENSHTEIN_STRLEN))); ! /* One more cell for initial segments. */ ! ++m; ! ++n; /* ! * Instead of building an (m+1)x(n+1) array, we'll use two ! * different arrays of size m+1 for storing accumulated values in ! * previous calls. */ ! prev = palloc(2 * m * sizeof(char)); ! curr = prev + (m * sizeof(char)); ! for (i = 0; i < m; i++) ! prev[i] = i; ! for (y = t, j = 1; j < n; y++, j++) { + int *temp; + /* ! * First cell must increment sequentially, as we're on the ! * j'th column of the (m+1)x(n+1) array. */ ! curr[0] = j; ! ! for (x = s, i = 1; i < m; x++, i++) { ! int ins; ! int del; ! int sub; ! ! /* Calculate costs for probable operations. */ ! ins = prev[i] + ins_c; /* Insertion */ ! del = curr[i-1] + del_c; /* Deletion */ ! sub = prev[i-1] + ((*x == *y) ? 0 : sub_c); /* Substitution */ ! ! /* Take the one with minimum cost. */ ! curr[i] = ins < del ? ins : del; ! curr[i] = curr[i] < sub ? curr[i] : sub; ! } ! /* Swap current row with previous row. */ ! temp = curr; ! curr = prev; ! prev = temp; ! } ! /* ! * Because the final value was swapped from the previous row to ! * the current row, that's where we'll find it. ! */ ! return prev[m-1]; ! } ! PG_FUNCTION_INFO_V1(levenshtein_with_costs); ! Datum ! levenshtein_with_costs(PG_FUNCTION_ARGS) ! { ! char *src; ! char *dst; ! int ins_c; ! int del_c; ! int sub_c; ! ! src = text_to_cstring(PG_GETARG_TEXT_P(0)); ! dst = text_to_cstring(PG_GETARG_TEXT_P(1)); ! ! ins_c = PG_GETARG_INT32(2); ! del_c = PG_GETARG_INT32(3); ! sub_c = PG_GETARG_INT32(4); ! PG_RETURN_INT32(levenshtein_internal(src, dst, ins_c, del_c, sub_c)); ! } ! PG_FUNCTION_INFO_V1(levenshtein); ! Datum ! levenshtein(PG_FUNCTION_ARGS) ! { ! char *src; ! char *dst; ! src = text_to_cstring(PG_GETARG_TEXT_P(0)); ! dst = text_to_cstring(PG_GETARG_TEXT_P(1)); ! PG_RETURN_INT32(levenshtein_internal(src, dst, 1, 1, 1)); } + /* * Calculates the metaphone of an input string. * Returns number of characters requested Index: contrib/fuzzystrmatch/fuzzystrmatch.h =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.h,v retrieving revision 1.17 diff -c -r1.17 fuzzystrmatch.h *** contrib/fuzzystrmatch/fuzzystrmatch.h 25 Mar 2008 22:42:41 -0000 1.17 --- contrib/fuzzystrmatch/fuzzystrmatch.h 3 Apr 2008 06:08:21 -0000 *************** *** 58,63 **** --- 58,64 ---- /* * External declarations */ + extern Datum levenshtein_with_costs(PG_FUNCTION_ARGS); extern Datum levenshtein(PG_FUNCTION_ARGS); extern Datum metaphone(PG_FUNCTION_ARGS); extern Datum soundex(PG_FUNCTION_ARGS); *************** *** 80,85 **** --- 81,87 ---- * Levenshtein */ #define MAX_LEVENSHTEIN_STRLEN 255 + static int levenshtein_internal(char *s, char *t, int ins_c, int del_c, int sub_c); /* Index: contrib/fuzzystrmatch/fuzzystrmatch.sql.in =================================================================== RCS file: /projects/cvsroot/pgsql/contrib/fuzzystrmatch/fuzzystrmatch.sql.in,v retrieving revision 1.9 diff -c -r1.9 fuzzystrmatch.sql.in *** contrib/fuzzystrmatch/fuzzystrmatch.sql.in 13 Nov 2007 04:24:28 -0000 1.9 --- contrib/fuzzystrmatch/fuzzystrmatch.sql.in 3 Apr 2008 06:08:21 -0000 *************** *** 3,8 **** --- 3,12 ---- -- Adjust this setting to control where the objects get created. SET search_path = public; + CREATE OR REPLACE FUNCTION levenshtein (text,text,int,int,int) RETURNS int + AS 'MODULE_PATHNAME','levenshtein_with_costs' + LANGUAGE C IMMUTABLE STRICT; + CREATE OR REPLACE FUNCTION levenshtein (text,text) RETURNS int AS 'MODULE_PATHNAME','levenshtein' LANGUAGE C IMMUTABLE STRICT;