00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00036
00037 #include <assert.h>
00038 #include <errno.h>
00039 #include <limits.h>
00040 #include <math.h>
00041 #include <stdarg.h>
00042 #include <stdbool.h>
00043 #include <stdint.h>
00044 #include <stdio.h>
00045 #include <stdlib.h>
00046 #include <string.h>
00047
00050
00052 #define VERSION "2007-05-15"
00053
00055 #define YEAR "2007"
00056
00060
00062
00063 #define LEVELS_LIST 0.0001, 0.001, 0.01, 0.05, 0.10
00064
00069
00070 #ifndef WORD_BITS
00072
00073 # define WORD_BITS 32
00074 #endif
00075
00076 #ifndef BITCOUNT_BITS
00078
00079 # define BITCOUNT_BITS 11
00080 #endif
00081
00083
00084 typedef unsigned bitcount_t;
00085
00089
00090 #if WORD_BITS == 32
00092 typedef uint_fast32_t word_t;
00094 # define WORD_ONE UINT32_C(1)
00096 # define MSB_SHIFT 5
00097 #elif WORD_BITS == 64
00098 typedef uint_fast64_t word_t;
00099 # define WORD_ONE UINT64_C(1)
00100 # define MSB_SHIFT 6
00101 #else
00102 # error "Unsupported WORD_BITS"
00103 #endif
00104
00106 #define LSB_MASK ((WORD_ONE << MSB_SHIFT) - WORD_ONE)
00107
00109
00110 inline static size_t
00111 get_msb_index(size_t i)
00112 {
00113 return i >> MSB_SHIFT;
00114 }
00115
00117
00118 inline static word_t
00119 get_lsb_bit(size_t i)
00120 {
00121 return WORD_ONE << (i & LSB_MASK);
00122 }
00123
00128
00130 #define STRINGIFY2(x) #x
00132 #define STRINGIFY(x) STRINGIFY2(x)
00133
00135 #define MIN(x, y) ((x) < (y) ? (x) : (y))
00137 #define MAX(x, y) ((x) > (y) ? (x) : (y))
00138
00143
00145 #define MYMALLOC(target, type, count) \
00146 do { \
00147 type *my_result = mymalloc(size_multiply(sizeof(type), count)); \
00148 target = my_result; \
00149 } while (0)
00150
00152 #define MYMALLOCZ(target, type, count, init) \
00153 do { \
00154 size_t my_count = count; \
00155 type *my_result = mymalloc(size_multiply(sizeof(type), my_count)); \
00156 for (size_t my_i = 0; my_i < my_count; my_i++) { \
00157 my_result[my_i] = init; \
00158 } \
00159 target = my_result; \
00160 } while (0)
00161
00163 #define MYMALLOC2Z(target, type, count1, count2, init) \
00164 MYMALLOCZ(target, type, size_multiply(count1, count2), init)
00165
00167 static size_t
00168 size_multiply(size_t a, size_t b)
00169 {
00170 size_t x = a;
00171 x *= b;
00172 if (x / b != a) {
00173 fprintf(stderr, "size overflow (%zu * %zu > %zu)\n", a, b, SIZE_MAX);
00174 exit(EXIT_FAILURE);
00175 }
00176 return x;
00177 }
00178
00180 static void *
00181 mymalloc(size_t s)
00182 {
00183 void *p = malloc(s);
00184 if (p == NULL) {
00185 fprintf(stderr, "malloc failed (%zu bytes)\n", s);
00186 exit(EXIT_FAILURE);
00187 }
00188 return p;
00189 }
00190
00194
00196
00197 static void
00198 myfscanf(int expected, const char *description, FILE *stream, const char *format, ...)
00199 {
00200 va_list ap;
00201 va_start(ap, format);
00202 int result = vfscanf(stream, format, ap);
00203 va_end(ap);
00204
00205 if (expected != result) {
00206 if (result == EOF) {
00207 fprintf(stderr, "%s: expected %d fields, got EOF: %s\n", description, expected, format);
00208 } else if (expected == EOF) {
00209 fprintf(stderr, "%s: expected EOF, got %d fields: %s\n", description, result, format);
00210 } else {
00211 fprintf(stderr, "%s: expected %d fields, got %d fields: %s\n", description, expected, result, format);
00212 }
00213 exit(EXIT_FAILURE);
00214 }
00215 }
00216
00218 static void
00219 bad_uint(const char *s)
00220 {
00221 fprintf(stderr, "not a valid unsigned integer: %s\n", s);
00222 exit(EXIT_FAILURE);
00223 }
00224
00226 static unsigned
00227 get_uint(const char *s)
00228 {
00229 if (*s == '\0') {
00230 bad_uint(s);
00231 }
00232 char *pend;
00233 errno = 0;
00234 unsigned long v = strtoul(s, &pend, 10);
00235 if (*pend != '\0') {
00236 bad_uint(s);
00237 }
00238 if (v == ULONG_MAX && errno == ERANGE) {
00239 bad_uint(s);
00240 }
00241 #if UINT_MAX < ULONG_MAX
00242 if (v > UINT_MAX) {
00243 bad_uint(s);
00244 }
00245 #endif
00246 return (unsigned int)v;
00247 }
00248
00253
00255 #define BUILTIN_RNG_BITS 15
00256
00258 #define BUILTIN_RNG_MAX ((1U << BUILTIN_RNG_BITS) - 1)
00259
00261 #define BUILTIN_RNG_LARGE_BITS (BUILTIN_RNG_BITS * 2)
00262
00264 #define BUILTIN_RNG_LARGE_MAX ((1U << BUILTIN_RNG_LARGE_BITS) - 1)
00265
00267 typedef uint_fast32_t builtin_rng_state_t;
00268
00270 inline static void
00271 builtin_rng_init(builtin_rng_state_t *state)
00272 {
00273 *state = 1;
00274 }
00275
00277
00278 inline static uint_fast16_t
00279 builtin_rng(builtin_rng_state_t *state)
00280 {
00281 *state = *state * 1103515245 + 12345;
00282 return (*state >> 16) & BUILTIN_RNG_MAX;
00283 }
00284
00286
00287 inline static uint_fast32_t
00288 builtin_rng_large(builtin_rng_state_t *state)
00289 {
00290 uint_fast32_t value = builtin_rng(state) << BUILTIN_RNG_BITS;
00291 value |= builtin_rng(state);
00292 return value;
00293 }
00294
00296 inline static unsigned
00297 builtin_rng_n(builtin_rng_state_t *state, unsigned n)
00298 {
00299 return (unsigned)((double)n * (builtin_rng_large(state) / (BUILTIN_RNG_LARGE_MAX + 1.0)));
00300 }
00301
00305
00307
00308 inline static unsigned
00309 myrand_n(unsigned n)
00310 {
00311 return (unsigned)((double)n * (rand() / (RAND_MAX + 1.0)));
00312 }
00313
00315 inline static void
00316 myswap(unsigned i, unsigned j, unsigned * restrict table)
00317 {
00318 unsigned tmp = table[j];
00319 table[j] = table[i];
00320 table[i] = tmp;
00321 }
00322
00324 inline static void
00325 identity_permutation(unsigned n, unsigned * restrict table)
00326 {
00327 for (unsigned i = 0; i < n; i++) {
00328 table[i] = i;
00329 }
00330 }
00331
00333
00334 inline static void
00335 rand_permutation(unsigned n, unsigned * restrict table)
00336 {
00337 identity_permutation(n, table);
00338 for (unsigned i = 0; i < n; i++) {
00339 unsigned j = myrand_n(n - i) + i;
00340 myswap(i, j, table);
00341 }
00342 }
00343
00345
00346 inline static void
00347 builtin_rng_permutation(builtin_rng_state_t * restrict state, unsigned n, unsigned * restrict table)
00348 {
00349 identity_permutation(n, table);
00350 for (unsigned i = 0; i < n; i++) {
00351 unsigned j = builtin_rng_n(state, n - i) + i;
00352 myswap(i, j, table);
00353 }
00354 }
00355
00359
00361
00362 inline static unsigned
00363 naive_bitcount(unsigned w)
00364 {
00365 unsigned c = 0;
00366 while (w != 0) {
00367 c += (w & 1U);
00368 w >>= 1;
00369 }
00370 return c;
00371 }
00372
00374 static bitcount_t bitcounts[1U << BITCOUNT_BITS];
00375
00377 static void
00378 init_bitcount(void)
00379 {
00380 for (unsigned i = 0; i < (1U << BITCOUNT_BITS); i++) {
00381 bitcounts[i] = naive_bitcount(i);
00382 }
00383 }
00384
00386 inline static unsigned
00387 bitcount(word_t w)
00388 {
00389 return
00390 #if BITCOUNT_BITS == 8
00391 bitcounts[w & 0xFFU]
00392 + bitcounts[(w >> 8) & 0xFFU]
00393 + bitcounts[(w >> 16) & 0xFFU]
00394 + bitcounts[(w >> 24) & 0xFFU]
00395 # if WORD_BITS == 64
00396 + bitcounts[(w >> 32) & 0xFFU]
00397 + bitcounts[(w >> 40) & 0xFFU]
00398 + bitcounts[(w >> 48) & 0xFFU]
00399 + bitcounts[(w >> 56) & 0xFFU]
00400 # endif
00401 #elif BITCOUNT_BITS == 11
00402 bitcounts[w & 0x7FFU]
00403 + bitcounts[(w >> 11) & 0x7FFU]
00404 + bitcounts[(w >> 22) & 0x7FFU]
00405 # if WORD_BITS == 64
00406 + bitcounts[(w >> 33) & 0x7FFU]
00407 + bitcounts[(w >> 44) & 0x7FFU]
00408 + bitcounts[(w >> 55) & 0x7FFU]
00409 # endif
00410 #elif BITCOUNT_BITS == 16
00411 bitcounts[w & 0xFFFFU]
00412 + bitcounts[(w >> 16) & 0xFFFFU]
00413 # if WORD_BITS == 64
00414 + bitcounts[(w >> 32) & 0xFFFFU]
00415 + bitcounts[(w >> 48) & 0xFFFFU]
00416 # endif
00417 #else
00418 # error "Unsupported BITCOUNT_BITS"
00419 #endif
00420 ;
00421 }
00422
00424 typedef struct {
00426 word_t at_least_1;
00428 word_t at_least_2;
00429 } zom_t;
00430
00432 inline static zom_t zom_add(zom_t x, zom_t y)
00433 {
00434 zom_t z;
00435 z.at_least_1 = x.at_least_1 | y.at_least_1;
00436 z.at_least_2 = x.at_least_2 | y.at_least_2 | (x.at_least_1 & y.at_least_1);
00437 return z;
00438 }
00439
00441 inline static word_t zom_exactly_1(zom_t x)
00442 {
00443 return x.at_least_1 & ~x.at_least_2;
00444 }
00445
00447 inline static void
00448 word_clear(word_t * restrict array, size_t n)
00449 {
00450 for (unsigned i = 0; i < n; i++) {
00451 array[i] = 0;
00452 }
00453 }
00454
00456 inline static void
00457 zom_clear(zom_t * restrict array, size_t n)
00458 {
00459 for (unsigned i = 0; i < n; i++) {
00460 array[i].at_least_1 = 0;
00461 array[i].at_least_2 = 0;
00462 }
00463 }
00464
00469
00471 static void
00472 usage(const char *name)
00473 {
00474 printf(
00475 "Type and hapax accumulation curves\n"
00476 "\n"
00477 "usage:\n"
00478 " %s [OPTION]... ITERATIONS SLOTS\n"
00479 " %s [OPTION]... ITERATIONS --slot-size SLOT_SIZE\n"
00480 "\n"
00481 " --help This help.\n"
00482 " --version Version and copyright information.\n"
00483 " --slot-size Spesify the size of a slot instead of the number of the slots.\n"
00484 " --hapax Count hapaxes only.\n"
00485 " --items-from-table Item counts from the incidence matrix.\n"
00486 " --brief Skip redundant identical rows in output.\n"
00487 " --raw-data Output data for each permutation.\n"
00488 " --nonrandom For debugging: use the identity permutation.\n"
00489 " --builtin-rng For debugging: use the built-in pseudorandom number generator.\n"
00490 "\n"
00491 " ITERATIONS The number of iterations.\n"
00492 " SLOTS The number of slots.\n"
00493 " SLOT_SIZE The size of each slot.\n"
00494 "\n"
00495 "Input format (fields separated by any whitespace):\n"
00496 "\n"
00497 " <number-of-samples> <number-of-types>\n"
00498 " <sample-1-label> <sample-1-total-items> ...\n"
00499 " <sample-m-label> <sample-m-total-items>\n"
00500 " <type-1-label> ...\n"
00501 " <type-n-label>\n"
00502 " <count-sample-1-type-1> ... <count-sample-1-type-n> ...\n"
00503 " <count-sample-m-type-1> ... <count-sample-m-type-n> ...\n"
00504 "\n"
00505 "Output format (one line for each slot):\n"
00506 "\n"
00507 " <item-count> <lower-bounds> ... <upper-bounds> ...\n"
00508 " ...\n"
00509 "\n"
00510 "Lower and upper bounds are printed for the following quantiles:\n"
00511 "\n"
00512 " " STRINGIFY((LEVELS_LIST)) "\n"
00513 "\n"
00514 "Output format for --raw-data:\n"
00515 "\n"
00516 " <item-count> <value>\n"
00517 " ...\n"
00518 "\n",
00519 name, name);
00520
00521 exit(EXIT_SUCCESS);
00522 }
00523
00525 static void
00526 version(void)
00527 {
00528 printf(
00529 "Type and hapax accumulation curves, version " VERSION "\n"
00530 "Copyright (C) " YEAR " Jukka Suomela\n"
00531 "\n"
00532 "This program comes with ABSOLUTELY NO WARRANTY.\n"
00533 "This is free software, and you are welcome to redistribute it\n"
00534 "under the terms of the GNU General Public License\n"
00535 "<http://www.gnu.org/licenses/gpl.html>.\n\n"
00536 );
00537 exit(EXIT_SUCCESS);
00538 }
00539
00541 typedef struct {
00543
00544 word_t * restrict incidenceb;
00546
00547 zom_t * restrict incidencezom;
00549
00550 unsigned iterations;
00552
00553 unsigned slots;
00555
00556 unsigned requested_slot_size;
00558
00559 unsigned nsample;
00561
00562 unsigned ntype;
00564
00565 unsigned type_words;
00567
00568 unsigned total_items;
00570
00571 unsigned *sample_items;
00573 bool slot_size;
00575 bool items_from_table;
00577
00578 bool hapax;
00580
00581 bool brief;
00583
00584 bool raw_data;
00586 bool nonrandom;
00588 bool builtin_rng;
00589 } input_t;
00590
00592
00593 static void
00594 parse_command_line(input_t * restrict pinput, int argc, char **argv)
00595 {
00596 pinput->iterations = 0;
00597 pinput->slots = 0;
00598 pinput->requested_slot_size = 0;
00599 pinput->hapax = false;
00600 pinput->slot_size = false;
00601 pinput->items_from_table = false;
00602 pinput->brief = false;
00603 pinput->raw_data = false;
00604 pinput->nonrandom = false;
00605 pinput->builtin_rng = false;
00606
00607 for (int i = 1; i < argc; i++) {
00608 if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
00609 usage(argv[0]);
00610 } else if (strcmp(argv[i], "--version") == 0 || strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "-V") == 0) {
00611 version();
00612 } else if (strcmp(argv[i], "--hapax") == 0) {
00613 pinput->hapax = true;
00614 } else if (strcmp(argv[i], "--slot-size") == 0) {
00615 pinput->slot_size = true;
00616 } else if (strcmp(argv[i], "--items-from-table") == 0) {
00617 pinput->items_from_table = true;
00618 } else if (strcmp(argv[i], "--raw-data") == 0) {
00619 pinput->raw_data = true;
00620 } else if (strcmp(argv[i], "--brief") == 0) {
00621 pinput->brief = true;
00622 } else if (strcmp(argv[i], "--nonrandom") == 0) {
00623 pinput->nonrandom = true;
00624 } else if (strcmp(argv[i], "--builtin-rng") == 0) {
00625 pinput->builtin_rng = true;
00626 } else if (pinput->iterations == 0) {
00627 pinput->iterations = get_uint(argv[i]);
00628 if (pinput->iterations == 0) {
00629 fprintf(stderr, "invalid arguments: the number of iterations cannot be 0\n");
00630 exit(EXIT_FAILURE);
00631 }
00632 } else if (!pinput->raw_data && !pinput->slot_size && pinput->slots == 0) {
00633 pinput->slots = get_uint(argv[i]);
00634 if (pinput->slots < 2) {
00635 fprintf(stderr, "invalid arguments: the number of slots has to be at least 2\n");
00636 exit(EXIT_FAILURE);
00637 }
00638 } else if (!pinput->raw_data && pinput->slot_size && pinput->requested_slot_size == 0) {
00639 pinput->requested_slot_size = get_uint(argv[i]);
00640 if (pinput->requested_slot_size == 0) {
00641 fprintf(stderr, "invalid arguments: the slot size cannot be 0\n");
00642 exit(EXIT_FAILURE);
00643 }
00644 } else {
00645 fprintf(stderr, "too many command line arguments: '%s'\n", argv[i]);
00646 exit(EXIT_FAILURE);
00647 }
00648 }
00649
00650 if (pinput->iterations == 0) {
00651 fprintf(stderr, "invalid arguments: the number of iterations not given\n");
00652 exit(EXIT_FAILURE);
00653 }
00654 if (!pinput->raw_data) {
00655 if (pinput->slot_size) {
00656 if (pinput->requested_slot_size == 0) {
00657 fprintf(stderr, "invalid arguments: the slot size not given\n");
00658 exit(EXIT_FAILURE);
00659 }
00660 } else {
00661 if (pinput->slots == 0) {
00662 fprintf(stderr, "invalid arguments: the number of iterations not given\n");
00663 exit(EXIT_FAILURE);
00664 }
00665 }
00666 }
00667 }
00668
00670
00671 static void
00672 process_input(input_t * restrict pinput)
00673 {
00674 myfscanf(2, "<number-of-samples> <number-of-types>", stdin, "%u %u", &pinput->nsample, &pinput->ntype);
00675 if (pinput->nsample == 0) {
00676 fprintf(stderr, "invalid input: the number of samples has to be at least 1\n");
00677 exit(EXIT_FAILURE);
00678 }
00679 if (pinput->ntype == 0) {
00680 fprintf(stderr, "invalid input: the number of types has to be at least 1\n");
00681 exit(EXIT_FAILURE);
00682 }
00683
00684 pinput->type_words = (pinput->ntype + WORD_BITS - 1) / WORD_BITS;
00685 pinput->total_items = 0;
00686 MYMALLOC(pinput->sample_items, unsigned, pinput->nsample);
00687 for (unsigned i = 0; i < pinput->nsample; i++) {
00688 unsigned v;
00689 myfscanf(1, "<sample-label> <sample-total-items>", stdin, "%*s %u", &v);
00690 if (pinput->items_from_table) {
00691 pinput->sample_items[i] = 0;
00692 } else {
00693 if (v == 0) {
00694 fprintf(stderr, "invalid input in sample %u: the number of items has to be at least 1\n", i);
00695 exit(EXIT_FAILURE);
00696 }
00697 pinput->sample_items[i] = v;
00698 pinput->total_items += pinput->sample_items[i];
00699 }
00700 }
00701 for (unsigned i = 0; i < pinput->ntype; i++) {
00702 myfscanf(0, "<type-label>", stdin, "%*s");
00703 }
00704 if (pinput->hapax) {
00705 const zom_t zero = { 0, 0 };
00706 MYMALLOC2Z(pinput->incidencezom, zom_t, pinput->nsample, pinput->type_words, zero);
00707 } else {
00708 MYMALLOC2Z(pinput->incidenceb, word_t, pinput->nsample, pinput->type_words, 0);
00709 }
00710 for (unsigned i = 0; i < pinput->nsample; i++) {
00711 for (unsigned j = 0; j < pinput->ntype; j++) {
00712 unsigned v;
00713 myfscanf(1, "<count-sample-type>", stdin, "%u", &v);
00714 if (v > 0) {
00715 size_t pos = (size_t)i * pinput->type_words + get_msb_index(j);
00716 word_t mask = get_lsb_bit(j);
00717 if (pinput->hapax) {
00718 pinput->incidencezom[pos].at_least_1 |= mask;
00719 if (v > 1) {
00720 pinput->incidencezom[pos].at_least_2 |= mask;
00721 }
00722 } else {
00723 pinput->incidenceb[pos] |= mask;
00724 }
00725 }
00726 if (pinput->items_from_table) {
00727 pinput->sample_items[i] += v;
00728 pinput->total_items += v;
00729 }
00730 }
00731 }
00732 myfscanf(EOF, "end of input", stdin, "%*s");
00733
00734 if (pinput->total_items == 0) {
00735 fprintf(stderr, "invalid input: the total number of items has to be at least 1\n");
00736 exit(EXIT_FAILURE);
00737 }
00738 }
00739
00741 static void
00742 free_input(const input_t *pinput)
00743 {
00744 free(pinput->sample_items);
00745 if (pinput->hapax) {
00746 free(pinput->incidencezom);
00747 } else {
00748 free(pinput->incidenceb);
00749 }
00750 }
00751
00755
00757 typedef struct {
00759 unsigned * restrict slot_threshold;
00761
00762 unsigned * restrict lower_bound;
00764
00765 unsigned * restrict upper_bound;
00766 } output_t;
00767
00769 typedef struct {
00771 unsigned lower;
00773 unsigned upper;
00774 } bounds_t;
00775
00777
00778 static void
00779 prepare_slots(input_t * restrict pinput, output_t * restrict poutput)
00780 {
00781 if (pinput->slot_size) {
00782 assert(pinput->slots == 0);
00783 pinput->slots = (pinput->total_items + pinput->requested_slot_size - 1) / pinput->requested_slot_size + 1;
00784 }
00785 MYMALLOC(poutput->slot_threshold, unsigned, pinput->slots);
00786 if (pinput->slot_size) {
00787 for (unsigned i = 0; i < pinput->slots - 1; i++) {
00788 poutput->slot_threshold[i] = i * pinput->requested_slot_size;
00789 }
00790 } else {
00791 for (unsigned i = 0; i < pinput->slots; i++) {
00792 poutput->slot_threshold[i] = (unsigned)lround((double)pinput->total_items * (double)i / (double)(pinput->slots - 1));
00793 }
00794 }
00795 poutput->slot_threshold[0] = 0;
00796 poutput->slot_threshold[pinput->slots - 1] = pinput->total_items;
00797 }
00798
00800 inline static void
00801 next_permutation(builtin_rng_state_t * restrict rng_state, const input_t *pinput, unsigned * restrict sample_order)
00802 {
00803 if (pinput->nonrandom) {
00804 identity_permutation(pinput->nsample, sample_order);
00805 } else if (pinput->builtin_rng) {
00806 builtin_rng_permutation(rng_state, pinput->nsample, sample_order);
00807 } else {
00808 rand_permutation(pinput->nsample, sample_order);
00809 }
00810 }
00811
00813 inline static void
00814 calculate_bounds_normal(const input_t *pinput, unsigned sample, word_t * restrict accum, unsigned * restrict p_accum_types, bounds_t * restrict pb_sample)
00815 {
00816 const word_t * restrict vector = pinput->incidenceb + (size_t)sample * pinput->type_words;
00817
00818 unsigned types = 0;
00819 for (unsigned j = 0; j < pinput->type_words; j++) {
00820 accum[j] |= vector[j];
00821 types += bitcount(accum[j]);
00822 }
00823
00824 assert(*p_accum_types <= types);
00825
00826 pb_sample->lower = *p_accum_types;
00827 pb_sample->upper = types;
00828 *p_accum_types = types;
00829 }
00830
00832 inline static void
00833 calculate_bounds_hapax(const input_t *pinput, unsigned sample, zom_t * restrict accum, unsigned * restrict p_accum_hapaxes, bounds_t * restrict pb_sample)
00834 {
00835 const zom_t * restrict vector = pinput->incidencezom + (size_t)sample * pinput->type_words;
00836
00837 unsigned removed = 0;
00838 unsigned created = 0;
00839 unsigned temporary = 0;
00840 for (unsigned j = 0; j < pinput->type_words; j++) {
00841 const zom_t current = vector[j];
00842 const zom_t accum_old = accum[j];
00843 const zom_t accum_new = zom_add(accum_old, current);
00844 accum[j] = accum_new;
00845
00846 const word_t hapax_old = zom_exactly_1(accum_old);
00847 const word_t hapax_new = zom_exactly_1(accum_new);
00848 const word_t hapax_temporary = ~accum_old.at_least_1 & current.at_least_2;
00849 assert((hapax_old & hapax_temporary) == 0);
00850 assert((hapax_new & hapax_temporary) == 0);
00851 temporary += bitcount(hapax_temporary);
00852 removed += bitcount(hapax_old & ~hapax_new);
00853 created += bitcount(~hapax_old & hapax_new);
00854 }
00855
00856 assert(*p_accum_hapaxes >= removed);
00857 assert(*p_accum_hapaxes + created + temporary <= pinput->ntype);
00858
00859 pb_sample->lower = *p_accum_hapaxes - removed;
00860 pb_sample->upper = *p_accum_hapaxes + created + temporary;
00861 *p_accum_hapaxes = *p_accum_hapaxes + created - removed;
00862 }
00863
00865
00866 inline static void
00867 update_bounds_normal(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
00868 {
00869
00870 assert(pb_this_gap->lower <= pb_sample->lower);
00871 assert(pb_sample->lower <= pb_this_gap->upper);
00872 assert(pb_this_gap->upper <= pb_sample->upper);
00873 pb_this_gap->upper = pb_sample->upper;
00874 }
00875
00877
00878 inline static void
00879 update_bounds_hapax(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
00880 {
00881 pb_this_gap->lower = MIN(pb_sample->lower, pb_this_gap->lower);
00882 pb_this_gap->upper = MAX(pb_sample->upper, pb_this_gap->upper);
00883 }
00884
00886 inline static void
00887 record(unsigned slots, const output_t *poutput, bounds_t b_slot, unsigned current_slot)
00888 {
00889 poutput->lower_bound[(size_t)b_slot.lower * slots + current_slot]++;
00890 poutput->upper_bound[(size_t)b_slot.upper * slots + current_slot]++;
00891 }
00892
00894
00895 inline static void
00896 record_normal(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
00897 {
00898
00899 assert(b_prev_gap.lower <= b_this_gap.lower);
00900 assert(b_prev_gap.upper <= b_this_gap.upper);
00901 const bounds_t b_slot = {
00902 b_prev_gap.lower,
00903 b_this_gap.upper
00904 };
00905 record(slots, poutput, b_slot, current_slot);
00906 }
00907
00909
00910 inline static void
00911 record_hapax(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
00912 {
00913
00914 const bounds_t b_slot = {
00915 MIN(b_prev_gap.lower, b_this_gap.lower),
00916 MAX(b_prev_gap.upper, b_this_gap.upper)
00917 };
00918 record(slots, poutput, b_slot, current_slot);
00919 }
00920
00922 #define DEF_CALCULATE_STATISTICS(suffix, representation) \
00923 static void \
00924 calculate_statistics_ ## suffix(const input_t * restrict pinput, const output_t *poutput) \
00925 { \
00926 builtin_rng_state_t rng_state; \
00927 builtin_rng_init(&rng_state); \
00928 \
00929 for (unsigned iteration = 0; iteration < pinput->iterations; iteration++) { \
00930 unsigned sample_order[pinput->nsample]; \
00931 next_permutation(&rng_state, pinput, sample_order); \
00932 \
00933 representation ## _t accum[pinput->type_words]; \
00934 representation ## _clear(accum, pinput->type_words); \
00935 \
00936 unsigned accum_items = 0; \
00937 unsigned accum_value = 0; \
00938 unsigned current_slot = 0; \
00939 \
00940 bounds_t b_prev_gap = { 0, 0 }; \
00941 bounds_t b_this_gap = { 0, 0 }; \
00942 \
00943 for (unsigned i = 0; i < pinput->nsample; i++) { \
00944 const unsigned sample = sample_order[i]; \
00945 accum_items += pinput->sample_items[sample]; \
00946 \
00947 bounds_t b_sample; \
00948 calculate_bounds_ ## suffix(pinput, sample, accum, &accum_value, &b_sample); \
00949 update_bounds_ ## suffix(&b_this_gap, &b_sample); \
00950 \
00951 while (current_slot + 1 < pinput->slots && accum_items >= poutput->slot_threshold[current_slot + 1]) { \
00952 record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
00953 current_slot++; \
00954 b_prev_gap = b_this_gap; \
00955 b_this_gap = b_sample; \
00956 } \
00957 if (accum_items == poutput->slot_threshold[current_slot]) { \
00958 b_this_gap.lower = accum_value; \
00959 b_this_gap.upper = accum_value; \
00960 } \
00961 } \
00962 \
00963 assert(current_slot == pinput->slots - 1); \
00964 record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
00965 } \
00966 }
00967
00969 DEF_CALCULATE_STATISTICS(normal, word)
00970
00971
00972 DEF_CALCULATE_STATISTICS(hapax, zom)
00973
00975 static void
00976 calculate_statistics(const input_t *pinput, output_t * restrict poutput)
00977 {
00978 MYMALLOC2Z(poutput->lower_bound, unsigned, pinput->ntype+1, pinput->slots, 0);
00979 MYMALLOC2Z(poutput->upper_bound, unsigned, pinput->ntype+1, pinput->slots, 0);
00980
00981 if (pinput->hapax) {
00982 calculate_statistics_hapax(pinput, poutput);
00983 } else {
00984 calculate_statistics_normal(pinput, poutput);
00985 }
00986 }
00987
00989 static void
00990 free_output(const output_t *poutput)
00991 {
00992 free(poutput->slot_threshold);
00993 free(poutput->lower_bound);
00994 free(poutput->upper_bound);
00995 }
00996
01000
01002 static void
01003 print_raw_data(unsigned accum_items, unsigned accum_value)
01004 {
01005 printf("%8u %4u\n", accum_items, accum_value);
01006 }
01007
01009 #define DEF_CALCULATE_RAW_DATA(suffix, representation) \
01010 static void \
01011 calculate_raw_data_ ## suffix(const input_t *pinput) \
01012 { \
01013 builtin_rng_state_t rng_state; \
01014 builtin_rng_init(&rng_state); \
01015 \
01016 for (unsigned iteration = 0; iteration < pinput->iterations; iteration++) { \
01017 unsigned sample_order[pinput->nsample]; \
01018 next_permutation(&rng_state, pinput, sample_order); \
01019 \
01020 representation ## _t accum[pinput->type_words]; \
01021 representation ## _clear(accum, pinput->type_words); \
01022 \
01023 unsigned accum_items = 0; \
01024 unsigned accum_value = 0; \
01025 \
01026 print_raw_data(accum_items, accum_value); \
01027 for (unsigned i = 0; i < pinput->nsample; i++) { \
01028 const unsigned sample = sample_order[i]; \
01029 accum_items += pinput->sample_items[sample]; \
01030 \
01031 bounds_t b_sample; \
01032 calculate_bounds_ ## suffix(pinput, sample, accum, &accum_value, &b_sample); \
01033 \
01034 print_raw_data(accum_items, accum_value); \
01035 } \
01036 } \
01037 }
01038
01040 DEF_CALCULATE_RAW_DATA(normal, word)
01041
01042
01043 DEF_CALCULATE_RAW_DATA(hapax, zom)
01044
01046 static void
01047 calculate_raw_data(const input_t *pinput)
01048 {
01049 if (pinput->hapax) {
01050 calculate_raw_data_hapax(pinput);
01051 } else {
01052 calculate_raw_data_normal(pinput);
01053 }
01054 }
01055
01059
01061 const double LEVELS[] = { LEVELS_LIST };
01062
01064 #define NLEVELS (sizeof LEVELS / sizeof LEVELS[0])
01065
01067 static void
01068 print_row(unsigned slot_threshold, const unsigned *lower_bounds, const unsigned *upper_bounds)
01069 {
01070 printf("%8u", slot_threshold);
01071 for (unsigned l = 0; l < NLEVELS; l++) {
01072 printf(" %4u", lower_bounds[l]);
01073 }
01074 for (unsigned l = 0; l < NLEVELS; l++) {
01075 printf(" %4u", upper_bounds[NLEVELS - 1 - l]);
01076 }
01077 printf("\n");
01078 }
01079
01081 static void
01082 print_result(const input_t *pinput, const output_t *poutput)
01083 {
01084 unsigned prev_lower_bounds[NLEVELS];
01085 unsigned prev_upper_bounds[NLEVELS];
01086 bool prev_valid = false;
01087 bool prev_delayed = false;
01088
01089 for (unsigned i = 0; i < pinput->slots; i++) {
01090 unsigned lower_bounds[NLEVELS];
01091 unsigned cum = 0;
01092 unsigned next_level = 0;
01093 for (unsigned j = 0; j <= pinput->ntype; j++) {
01094 cum += poutput->lower_bound[(size_t)j * pinput->slots + i];
01095 double fract = (double)cum / (double)pinput->iterations;
01096 while (next_level < NLEVELS && LEVELS[next_level] < fract) {
01097 lower_bounds[next_level] = j;
01098 next_level++;
01099 }
01100 }
01101 assert(next_level == NLEVELS);
01102
01103 unsigned upper_bounds[NLEVELS];
01104 cum = 0;
01105 next_level = 0;
01106 for (unsigned j = 0; j <= pinput->ntype; j++) {
01107 cum += poutput->upper_bound[(size_t)(pinput->ntype - j) * pinput->slots + i];
01108 double fract = (double)cum / (double)pinput->iterations;
01109 while (next_level < NLEVELS && LEVELS[next_level] < fract) {
01110 upper_bounds[next_level] = pinput->ntype - j;
01111 next_level++;
01112 }
01113 }
01114 assert(next_level == NLEVELS);
01115
01116 if (pinput->brief) {
01117 if (prev_valid) {
01118 bool same = true;
01119 for (unsigned l = 0; l < NLEVELS; l++) {
01120 if (prev_lower_bounds[l] != lower_bounds[l] || prev_upper_bounds[l] != upper_bounds[l]) {
01121 same = false;
01122 break;
01123 }
01124 }
01125 if (same) {
01126 prev_delayed = true;
01127 } else if (prev_delayed) {
01128 print_row(poutput->slot_threshold[i - 1], prev_lower_bounds, prev_upper_bounds);
01129 prev_valid = false;
01130 } else {
01131 prev_valid = false;
01132 }
01133 }
01134 if (!prev_valid) {
01135 print_row(poutput->slot_threshold[i], lower_bounds, upper_bounds);
01136 for (unsigned l = 0; l < NLEVELS; l++) {
01137 prev_lower_bounds[l] = lower_bounds[l];
01138 prev_upper_bounds[l] = upper_bounds[l];
01139 }
01140 prev_valid = true;
01141 prev_delayed = false;
01142 }
01143 } else {
01144 print_row(poutput->slot_threshold[i], lower_bounds, upper_bounds);
01145 }
01146 }
01147
01148 if (pinput->brief && prev_valid && prev_delayed) {
01149 print_row(poutput->slot_threshold[pinput->slots - 1], prev_lower_bounds, prev_upper_bounds);
01150 }
01151 }
01152
01156
01158 int
01159 main(int argc, char **argv)
01160 {
01161 init_bitcount();
01162
01163 input_t input;
01164 parse_command_line(&input, argc, argv);
01165 process_input(&input);
01166
01167 if (input.raw_data) {
01168 calculate_raw_data(&input);
01169 } else {
01170 output_t output;
01171 prepare_slots(&input, &output);
01172 calculate_statistics(&input, &output);
01173 print_result(&input, &output);
01174 free_output(&output);
01175 }
01176 free_input(&input);
01177 return EXIT_SUCCESS;
01178 }
01179