#include #include #include typedef unsigned char byte; typedef byte bool; #define repeat while(1) #define unless(E) if (!(E)) #define true 1 #define false 0 /* sort capability. Similar to ANSI C qsort, and cribbed from the old snowball system (of 2005). (I didn't use qsort because the idea of this experiment was to go back to "first principles", which could hardly presuppose the existence of a built-in sort routine. */ static void merge_part(char * p1, char * p1_end, /* source 1 */ char * p2, char * p2_end, /* source 2 */ char * s, /* destination */ int unit, /* size of each item */ int (*f)()) /* comparison routine */ { repeat { if (p1 >= p1_end) { memmove(s, p2, p2_end - p2); return; } if (p2 >= p2_end) { memmove(s, p1, p1_end - p1); return; } if (f(p1, p2) <= 0) { memmove(s, p1, unit); p1 += unit; } else { memmove(s, p2, unit); p2 += unit; } s += unit; } } static void merge_all(char * p, char * p_end, /* source */ char * s, /* destination */ int width, /* size of blocks to merge */ int unit, /* size of each item */ int (*f)()) /* comparison routine */ { repeat { char * p2 = p + width; char * p3 = p2 + width; if (p3 > p_end) { p3 = p_end; if (p2 > p_end) p2 = p_end; } merge_part(p, p2, p2, p3, s, unit, f); if (p3 == p_end) return; p = p3; s += width + width; } } extern void sort(void * p, void * p_end, int unit, int (*f)()) { int size = (char *) p_end - (char *) p; char * s = (char *) malloc(size); char * s_end = s + size; int width = unit; repeat { merge_all((char *) p, (char *) p_end, s, width, unit, f); width += width; if (width >= size) { memmove(p, s, size); break; } merge_all(s, s_end, (char *) p, width, unit, f); width += width; if (width >= size) break; } free(s); } /* end of sort capability */ struct map { void (*lose)(struct map *); /* destructor */ int size; /* size of map */ int count; /* number of names held */ struct n_slot **p; /* pointer to vector addressing name slots */ }; struct v_slot { void (*lose)(void *); /* destructor if held in map */ void * p; /* pointer to the value */ }; struct n_slot { struct n_slot * next; /* next name with same hash */ int length; /* length of name */ byte * p; /* pointer to the name */ int count; /* number of times name entered */ int v_count; /* number of values */ int v_size; /* value vector capacity */ struct v_slot * v; /* value vector */ }; extern void map_eachslot(struct map * M, void (*f)(), void * p) { int i; int j = 0; for (i = 0; i < M->size; i++) { struct n_slot * np = M->p[i]; while (np) { f(p, np, j); j++; np = np->next; } } } static int hash(int length, byte * p, int n) { unsigned int h = 0; int i; for (i = 0; i < length; i++) { h *= p[i] << (i & 7); h ^= p[i]; } return h % n; } extern void map_lose(struct map *M) { int i; for (i = 0; i < M->size; i++) { struct n_slot * np = M->p[i]; while (np) { struct n_slot * next_np = np->next; int j; for (j = 0; j < np->v_count; j++) { struct v_slot * v = np->v + j; if (v->lose) v->lose(v->p); } free(np->v); free(np->p); free(np); np = next_np; } } free(M->p); free(M); } extern struct map * map_make(int size) { struct map * M; M = calloc(1, sizeof(struct map)); M->size = size; M->p = calloc(size, sizeof(struct n_slot *)); M->lose = map_lose; return M; } static void map_reform(struct map * M, int size) { struct n_slot **p = calloc(size, sizeof(struct n_slot *)); int i; for (i = 0; i < M->size; i++) { struct n_slot *np = M->p[i]; while (np) { struct n_slot * next_np = np->next; int h = hash(np->length, np->p, size); np->next = p[h]; p[h] = np; np = next_np; } } free(M->p); M->p = p; M->size = size; /*printf("reformed %d\n", size);*/ } extern struct n_slot * map_findname(struct map * M, int length, byte * p) { int h = hash(length, p, M->size); struct n_slot * np = M->p[h]; repeat { unless (np) return 0; if (length == np->length && memcmp(p, np->p, length) == 0) return np; np = np->next; } } extern struct n_slot * map_addname(struct map * M, int length, byte * p) { int h = hash(length, p, M->size); struct n_slot **a = & M->p[h]; struct n_slot *np = *a; repeat { unless (np) break; if (length == np->length && memcmp(p, np->p, length) == 0) { np->count++; return np; } a = & np->next; np = *a; } *a = np = calloc(1, sizeof(struct n_slot)); np->length = length; np->p = malloc(length + 1); memmove(np->p, p, length); np->p[length] = 0; np->count = 1; M->count++; if (M->count >= M->size) map_reform(M, 2 * M->size); return np; } extern void map_addval(struct n_slot * np, void (*lose)(void *), byte * p) { if (np->v_count == np->v_size) { np->v_size = (np->v_size + 1) * 3 / 2; np->v = realloc(np->v, np->v_size * sizeof(struct v_slot)); } { struct v_slot * v = np->v + np->v_count; v->lose = lose; v->p = p; np->v_count++; } } /* returns -1 if nothing to delete, otherwise the number of values after deletion. */ extern int map_delval(struct n_slot * np) { int i = np->v_count - 1; if (i < 0) return i; { struct v_slot * v = np->v + i; if (v->lose) v->lose(v->p); } np->v_count = i; return i; } /*****************************************************************************/ struct V { struct n_slot * np; long long N; /* 64 bit number to control np->name collating order */ }; static long long collation_number(int length, byte * p) { long long N = 0; int o = 55; int i; for (i = 0; i < length; i++) { /* letters */ if (o < 0) return N; byte ch = p[i]; if (ch == '^') continue; N |= (long long)ch - 'a' + 1 << o; o -= 5; } o -= 5; if (length > 30) length = 30; for (i = 0; i < length; i++) { /* accents */ if (p[i] == '^') { if (o < 0) return N; N |= (long long)i + 1 << o; o -= 5; } } return N; } static void into_v(struct V * v, struct n_slot * np, int i) { (v + i)->np = np; (v + i)->N = collation_number(np->length, np->p); } static int compare_np(const struct V * v1, const struct V * v2) { long long diff = v1->N - v2->N; if (diff < 0) return -1; if (diff > 0) return 1; /*if (diff) return diff;*/ { byte * p1 = v1->np->p; byte * p2 = v2->np->p; int accent = 0; printf("comparing %s %s\n",p1,p2); repeat { int ch1 = * p1; int ch2 = * p2; if (ch1 == ch2) { if (ch1 == 0) return accent; p1++; p2++; continue; } if (ch1 == '^') { unless (accent) accent = 1; p1++; continue; } if (ch2 == '^') { unless (accent) accent = -1; p2++; continue; } return ch1 > ch2 ? 1 : -1; } } } static void print_slot(struct n_slot * np, FILE * out) { fprintf(out, "%s ", np->p); fprintf(out, " (%d)\n", np->count); { struct v_slot * v = np->v; int j; for (j = 0; j < np->v_count; j++) fprintf(out, " %s\n", (v + j)->p); fprintf(out, "\n"); } } static void sortmap(struct map * M, FILE * out) { struct V * v = malloc(M->count * sizeof(struct V) + 1); map_eachslot(M, into_v, v); /* qsort(v, M->count, sizeof(struct V), compare_np); */ sort(v, v + M->count, sizeof(struct V), compare_np); { int i; for (i = 0; i < M->count; i++) print_slot((v + i)->np, out); } free(v); } #define LC(ch) ch >= 'A' && ch <= 'Z' ? ch - 'A' + 'a' : ch #define LETTER(ch) ch >= 'a' && ch <= 'z' || ch == '^' int main(int argc, char * argv[]) { /**int count; for (count = 0; count < 500; count++)**/ { FILE *in; FILE *out; unless (argc == 3) { fprintf(stderr, "invalid args\n"); return 1; } in = fopen(argv[1], "r"); unless (in) { fprintf(stderr,"input %s not found\n",argv[1]); return 1; } out = fopen(argv[2], "w"); unless (out) { fprintf(stderr,"output %s cannot be opened\n",argv[2]); return 1; } struct map * M = map_make(5000); byte * q; byte K[100]; byte B[1000]; bool first; repeat { int ch; int l = 0; int c = 0; repeat { switch(ch = getc(in)) { case -1: goto out2; case '\n':goto out1; case '|': c = l; break; default: B[l++] = ch; } } out1: B[l] = 0; q = malloc(l + 1); memmove(q, B, l + 1); first = true; repeat { if (c == l) break; ch = B[c++]; ch = LC(ch); unless (LETTER(ch)) continue; { int i = 0; repeat { K[i++] = ch; if (c == l) break; ch = B[c++]; ch = LC(ch); unless (LETTER(ch)) break; } K[i] = 0; { struct n_slot * np = map_addname(M, i, K); map_addval(np, first ? free : 0, q); } } first = false; } } out2: fclose(in); sortmap(M, out); M->lose(M); fclose(out); } return 0; }