/* vstr stuff and bug fixes by James Antill ... LGPL and above MIT. * Timesstamp portability code by Michael B Allen gcc -g -Wall -W -O2 -o csv_vstr csv_vstr.c `pkg-config --cflags --libs vstr` */ /* BEG: configuration... */ #define VCSV_OPT_FLAGS VCSV_FLAG_LINE #define USE_INTERNAL_MEMCHR 1 #define USE_VDUMP 0 #define USE_DEBUG 0 #define LOOP_FOR 1 /* END: configuration... */ #if !(USE_DEBUG) # define NDEBUG 1 #endif #define VSTR_COMPILE_INCLUDE 1 #include #include #include #include /* option flags... */ #define VCSV_FLAG_NONE 0 #define VCSV_FLAG_LINE 1 /* only parse one line */ #define TRUE 1 #define FALSE 0 #define VCSV_ST_PRE 0 #define VCSV_ST_BEG 1 #define VCSV_ST_GET_BEG_DQUOT 2 #define VCSV_ST_GET_END_DQUOT 3 #define VCSV_ST_GET_NORM 4 #define VCSV_ST_SKIP_COMMA 5 #define VCSV_ST_SKIP_TRASH 6 #define VCSV_ST_SKIP_RET 7 #define VCSV_ST_INIT 8 #if USE_INTERNAL_MEMCHR # define VCSV__MEMCHR(x) vcsv__memchr(iter->ptr, (x), iter->len) static inline void *vcsv__memchr(const void *passed_s1, int c, size_t n) { const unsigned char *s1 = passed_s1; const void *ret = 0; int tmp = 0; switch (n) { /* if less than a word, don't trust glibc */ #if 0 case 15: tmp = s1[14] == c; if (tmp) ret = s1 + 14; case 14: tmp = s1[13] == c; if (tmp) ret = s1 + 13; case 13: tmp = s1[12] == c; if (tmp) ret = s1 + 12; case 12: tmp = s1[11] == c; if (tmp) ret = s1 + 11; case 11: tmp = s1[10] == c; if (tmp) ret = s1 + 10; case 10: tmp = s1[ 9] == c; if (tmp) ret = s1 + 9; case 9: tmp = s1[ 8] == c; if (tmp) ret = s1 + 8; case 8: tmp = s1[ 7] == c; if (tmp) ret = s1 + 7; #endif case 7: tmp = s1[ 6] == c; if (tmp) ret = s1 + 6; case 6: tmp = s1[ 5] == c; if (tmp) ret = s1 + 5; case 5: tmp = s1[ 4] == c; if (tmp) ret = s1 + 4; case 4: tmp = s1[ 3] == c; if (tmp) ret = s1 + 3; case 3: tmp = s1[ 2] == c; if (tmp) ret = s1 + 2; case 2: tmp = s1[ 1] == c; if (tmp) ret = s1 + 1; case 1: tmp = s1[ 0] == c; if (tmp) ret = s1 + 0; break; default: ret = memchr(s1, c, n); break; } return ((void *)ret); } #else # define VCSV__MEMCHR(x) memchr(iter->ptr, (x), iter->len) #endif #define VCSV__INC(x) do { size_t local_inc_tmp = (x); \ assert(local_inc_tmp <= iter->len); \ assert(local_inc_tmp <= len); \ \ iter->ptr += local_inc_tmp; \ iter->len -= local_inc_tmp; \ \ len -= local_inc_tmp; \ } while (FALSE) static Vstr_base *out = NULL; static Vstr_base *vcsv_data = NULL; static inline void vcsv_end(size_t pos, size_t data_len, size_t beg_len, size_t len, Vstr_sects *rows) { vstr_sects_add(rows, pos + (data_len - beg_len), beg_len - len); } size_t vcsv_row_parse(Vstr_base *s1, size_t pos, size_t *passed_len, Vstr_sects *rows, unsigned int flags) { size_t len = *passed_len; size_t data_len = len; size_t beg_len = 0; unsigned int state = VCSV_ST_INIT; const char *ptr = NULL; size_t tmp = 0; Vstr_iter iter[1]; if (!len) return (0); if (!vstr_iter_fwd_beg(s1, pos, len, iter)) abort(); while (len) { if (!iter->len) vstr_iter_fwd_nxt(iter); switch (state) { case VCSV_ST_SKIP_TRASH: if (!(ptr = VCSV__MEMCHR(','))) { VCSV__INC(iter->len); break; } tmp = ptr - iter->ptr; VCSV__INC(tmp); state = VCSV_ST_SKIP_COMMA; break; case VCSV_ST_SKIP_COMMA: assert(*iter->ptr == ','); VCSV__INC(1); state = VCSV_ST_PRE; beg_len = len; break; case VCSV_ST_PRE: if ((*iter->ptr == '\n') || (*iter->ptr == '\r')) { vcsv_end(pos, data_len, beg_len, len, rows); state = VCSV_ST_SKIP_RET; } else state = VCSV_ST_BEG; break; case VCSV_ST_SKIP_RET: if (flags & VCSV_FLAG_LINE) return (data_len - len); /* FALL THROUGH */ case VCSV_ST_INIT: while ((*iter->ptr == '\n') || (*iter->ptr == '\r')) { /* skip blanks to start... */ VCSV__INC(1); if (!iter->len && !vstr_iter_fwd_nxt(iter)) return (data_len); } beg_len = len; /* FALL THROUGH */ case VCSV_ST_BEG: if (*iter->ptr == '"') { state = VCSV_ST_GET_BEG_DQUOT; --beg_len; } else if (*iter->ptr == ',') { vcsv_end(pos, data_len, beg_len, len, rows); state = VCSV_ST_SKIP_COMMA; continue; } else state = VCSV_ST_GET_NORM; VCSV__INC(1); break; case VCSV_ST_GET_BEG_DQUOT: if (!(ptr = VCSV__MEMCHR('"'))) { VCSV__INC(iter->len); break; } tmp = ptr - iter->ptr; VCSV__INC(tmp + 1); state = VCSV_ST_GET_END_DQUOT; if (!len) { vcsv_end(pos, data_len, beg_len, 1, rows); return (data_len); } break; case VCSV_ST_GET_END_DQUOT: { unsigned int found_ret = FALSE; ++len; /* go back to the '"' */ switch (*iter->ptr) { case '\r': case '\n': found_ret = TRUE; case ',': vcsv_end(pos, data_len, beg_len, len, rows); beg_len = 0; if (found_ret) state = VCSV_ST_SKIP_RET; else state = VCSV_ST_SKIP_COMMA; break; default: vcsv_end(pos, data_len, beg_len, len, rows); beg_len = 0; state = VCSV_ST_SKIP_TRASH; break; case '"': { size_t tpos = pos + (data_len - beg_len) + (beg_len - len); vstr_del(s1, tpos, 1); --data_len; /* update lengths and re-init iter */ --beg_len; --*passed_len; len -= 2; /* for above */ vstr_iter_fwd_beg(s1, tpos + 1, len, iter); state = VCSV_ST_GET_BEG_DQUOT; continue; } break; } --len; /* reverse above */ } break; case VCSV_ST_GET_NORM: tmp = 0; while (tmp < iter->len) { if ((iter->ptr[tmp] == ',') || (iter->ptr[tmp] == '\r') || (iter->ptr[tmp] == '\n')) break; ++tmp; } VCSV__INC(tmp); if (!iter->len) break; vcsv_end(pos, data_len, beg_len, len, rows); if (iter->ptr[0] == ',') state = VCSV_ST_SKIP_COMMA; else state = VCSV_ST_SKIP_RET; break; default: assert(FALSE); } } if ((state != VCSV_ST_SKIP_RET) && (state != VCSV_ST_SKIP_TRASH)) vcsv_end(pos, data_len, beg_len, len, rows); return (data_len); } #if USE_VDUMP # define VDUMP(ret, rows) vdump(ret, rows) static void vdump(unsigned int ret, Vstr_sects *rows) { unsigned int scan = 0; vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n" "%d\n" "${rep_chr:%c%zu}\n", '=', 79, ret, '=', 79); while (scan < rows->num) { size_t pos = 1; size_t len = 0; ++scan; len = VSTR_SECTS_NUM(rows, scan)->len; if (len) pos = VSTR_SECTS_NUM(rows, scan)->pos; vstr_add_fmt(out, out->len, "|${vstr:%p%zu%zu%u}|\n", vcsv_data, pos, len, 0); } vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n", '-', 79); while (out->len) if (!vstr_sc_write_fd(out, 1, out->len, 1, NULL)) abort(); } #else # define VDUMP(ret, rows) do { ; } while (FALSE) /* nothing */ #endif #if defined(_WIN32) #include #define MILLISECONDS_BETWEEN_1970_AND_1601 11644473600000Ui64 typedef unsigned __int64 uint64_t; uint64_t timestamp(void) { FILETIME ftime; uint64_t ret; GetSystemTimeAsFileTime(&ftime); ret = ftime.dwHighDateTime; ret <<= 32Ui64; ret |= ftime.dwLowDateTime; ret = ret / 10000Ui64 - MILLISECONDS_BETWEEN_1970_AND_1601; return ret; } #else #include #include static inline uint64_t timestamp(void) { struct timeval tval; gettimeofday(&tval, NULL); return tval.tv_sec * 1000LL + tval.tv_usec / 1000; } #endif int main(int argc, char *argv[]) { Vstr_sects *vrows = NULL; size_t ret = 0; if (!vstr_init()) abort(); out = vstr_make_base(NULL); vstr_cntl_conf(NULL, VSTR_CNTL_CONF_SET_FMT_CHAR_ESC, '$'); vstr_sc_fmt_add_all(NULL); if (!out) abort(); if (argc != 2) { vstr_add_fmt(out, out->len, "%s [csv file]\n", argv[0]); while (out->len) if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL)) abort(); } vcsv_data = vstr_make_base(NULL); vrows = vstr_sects_make(500); if (!vcsv_data || !vrows) abort(); do { unsigned int err = 0; uint64_t t0 = timestamp(); unsigned int count = 0; Vstr_base *vcsv_mmap = vstr_make_base(NULL); if (!vcsv_mmap) abort(); if (!vstr_sc_mmap_file(vcsv_mmap, 0, argv[1], 0, 0, &err)) break; if (err) abort(); while (++count <= LOOP_FOR) { size_t pos = 1; size_t len = vcsv_mmap->len; vstr_add_vstr(vcsv_data, 0, vcsv_mmap, pos, len, VSTR_TYPE_ADD_DEF); assert(!vcsv_data->conf->malloc_bad); while ((ret = vcsv_row_parse(vcsv_data, pos, &len, vrows, VCSV_OPT_FLAGS))) { VDUMP(ret, vrows); assert(ret <= vcsv_data->len); pos += ret; len -= ret; vrows->num = 0; assert(len <= vcsv_data->len); assert(pos + len - 1 == vcsv_data->len); } } vstr_add_fmt(out, out->len, "%'llu milliseconds\n", (timestamp() - t0)); while (out->len) if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL)) abort(); vstr_free_base(vcsv_mmap); } while (FALSE); vstr_sects_free(vrows); vstr_free_base(out); vstr_free_base(vcsv_data); vstr_exit(); exit (EXIT_SUCCESS); }