// | |
// A utility for finding substring embeddings in patterns | |
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#define MAXPATHS (256*1024) | |
// | |
// | |
static void die( | |
const char*msg | |
) { | |
fprintf(stderr,"%s\n",msg); | |
exit(1); | |
} | |
// Finds the index of an entry, only used on xxx_key arrays | |
// Caveat: the table has to be sorted | |
static int find_in( | |
char *tab[], | |
int max, | |
const char *pat | |
) { | |
int left=0, right=max-1; | |
while (left <= right) { | |
int mid = ((right-left)/2)+left; | |
int v = strcmp(pat,tab[mid]); | |
if (v>0) { | |
left = mid + 1; | |
} else if (v<0) { | |
right = mid -1; | |
} else { | |
return mid; | |
} | |
} | |
return -1; | |
} | |
// used by partition (which is used by qsort_arr) | |
// | |
static void swap2( | |
char *a[], | |
char *b[], | |
int i, | |
int j | |
) { | |
if (i==j) return; | |
char*v; | |
v=a[i]; a[i]=a[j]; a[j]=v; | |
v=b[i]; b[i]=b[j]; b[j]=v; | |
} | |
// used by qsort_arr | |
// | |
static int partition( | |
char *a[], | |
char *b[], | |
int left, | |
int right, | |
int p | |
) { | |
const char *pivotValue = a[p]; | |
int i; | |
swap2(a,b,p,right); // Move pivot to end | |
p = left; | |
for (i=left; i<right; i++) { | |
if (strcmp(a[i],pivotValue)<=0) { | |
swap2(a,b,p,i); | |
p++; | |
} | |
} | |
swap2(a,b,right,p); // Move pivot to its final place | |
return p; | |
} | |
// | |
// | |
static void qsort_arr( | |
char *a[], | |
char *b[], | |
int left, | |
int right | |
) { | |
while (right > left) { | |
int p = left + (right-left)/2; //select a pivot | |
p = partition(a,b, left, right, p); | |
if ((p-1) - left < right - (p+1)) { | |
qsort_arr(a,b, left, p-1); | |
left = p+1; | |
} else { | |
qsort_arr(a,b, p+1, right); | |
right = p-1; | |
} | |
} | |
} | |
// Removes extra '0' entries from the string | |
// | |
static char* compact( | |
char *expr | |
) { | |
int l=strlen(expr); | |
int i,j; | |
for (i=0,j=0; i<l; i++) { | |
if (expr[i]!='0') expr[j++] = expr[i]; | |
} | |
expr[j]=0; | |
return expr; | |
} | |
// convert 'n1im' to 0n1i0m0 expressed as a string | |
// | |
static void expand( | |
char *expr, | |
const char *pat, | |
int l | |
) { | |
int el = 0; | |
char last = '.'; | |
int i; | |
for (i=0; i<l; i++) { | |
char c = pat[i]; | |
if ( (last<'0' || last>'9') | |
&& (c <'0' || c >'9') | |
) { | |
expr[el++] = '0'; | |
} | |
expr[el++] = c; | |
last = c; | |
} | |
if (last<'0' || last>'9') expr[el++] = '0'; | |
expr[el]=0; | |
} | |
// Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der | |
// The second pattern needs to be a right side match of the first | |
// (modulo digits) | |
static char *combine( | |
char *expr, | |
const char *subexpr | |
) { | |
int l1 = strlen(expr); | |
int l2 = strlen(subexpr); | |
int off = l1-l2; | |
int j; | |
// this works also for utf8 sequences because the substring is identical | |
// to the last substring-length bytes of expr except for the (single byte) | |
// hyphenation encoders | |
for (j=0; j<l2; j++) { | |
if (subexpr[j]>expr[off+j]) { | |
expr[off+j] = subexpr[j]; | |
} | |
} | |
return expr; | |
} | |
// | |
// | |
int main(int argc, const char* argv[]) { | |
FILE *in, *out; | |
char *pattab_key[MAXPATHS]; | |
char *pattab_val[MAXPATHS]; | |
int patterns = 0; | |
char *newpattab_key[MAXPATHS]; | |
char *newpattab_val[MAXPATHS]; | |
int newpatterns = 0; | |
char format[132]; // 64+65+newline+zero+spare | |
int p; | |
if (argc!=3) die("Usage: <orig-file> <new-file>\n"); | |
if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input"); | |
if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output"); | |
// read all patterns and split in pure text (_key) & expanded patterns (_val) | |
while(fgets(format,132,in)) { | |
int l = strlen(format); | |
if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp | |
if (format[0]=='%' || format[0]==0) { | |
// skip | |
} else { | |
if (format[l-1]=='%') { | |
l--; | |
format[l] = 0; // remove '%' | |
} | |
int i,j; | |
char *pat = (char*) malloc(l+1); | |
char *org = (char*) malloc(l*2+1); | |
expand(org,format,l); | |
// remove hyphenation encoders (digits) from pat | |
for (i=0,j=0; i<l; i++) { | |
// odd, but utf-8 proof | |
char c = format[i]; | |
if (c<'0' || c>'9') pat[j++]=c; | |
} | |
pat[j]=0; | |
p = patterns; | |
pattab_key[patterns] = pat; | |
pattab_val[patterns++] = org; | |
if (patterns>MAXPATHS) die("to many base patterns"); | |
} | |
} | |
fclose(in); | |
// As we use binairy search, make sure it is sorted | |
qsort_arr(pattab_key,pattab_val,0,patterns-1); | |
for (p=0; p<patterns; p++) { | |
char *pat = pattab_key[p]; | |
int patsize = strlen(pat); | |
int j,l; | |
for (l=1; l<=patsize; l++) { | |
for (j=1; j<=l; j++) { | |
int i = l-j; | |
int subpat_ndx; | |
char subpat[132]; | |
strncpy(subpat,pat+i,j); subpat[j]=0; | |
if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) { | |
int newpat_ndx; | |
char *newpat=malloc(l+1); | |
//printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]); | |
strncpy(newpat, pat+0,l); newpat[l]=0; | |
if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) { | |
char *neworg = malloc(132); // TODO: compute exact length | |
expand(neworg,newpat,l); | |
newpattab_key[newpatterns] = newpat; | |
newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]); | |
if (newpatterns>MAXPATHS) die("to many new patterns"); | |
//printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg); | |
} else { | |
free(newpat); | |
newpattab_val[newpat_ndx] = combine( | |
newpattab_val[newpat_ndx], pattab_val[subpat_ndx] ); | |
} | |
} | |
} | |
} | |
} | |
/* for some tiny extra speed, one could forget the free()s | |
* as the memory is freed anyway on exit(). | |
* However, the gain is minimal and now the code can be cleanly | |
* incorporated into other code */ | |
for (p=0; p<newpatterns; p++) { | |
fprintf(out,"%s\n",compact(newpattab_val[p])); | |
free(newpattab_key[p]); | |
free(newpattab_val[p]); | |
} | |
fclose(out); | |
for (p=0; p<patterns; p++) { | |
free(pattab_key[p]); | |
free(pattab_val[p]); | |
} | |
return 0; | |
} |