/* vstr stuff and bug fixes by james antill ... LGPL and above MIT. timesstamp() function by Michael B Allen gcc -g -Wall -W -O2 -o csv_vstr csv_vstr.c `pkg-config --cflags --libs vstr` */ /* configuration... */ #define VCSV_OPT_FLAGS VCSV_FLAG_LINE #define USE_VDUMP 1 #define USE_DEBUG 0 #if !(USE_DEBUG) # define NDEBUG 1 #endif #define VSTR_COMPILE_INCLUDE 1 #include #include #include #include /* option flags... */ #define VCSV_FLAG_NONE 0 #define VCSV_FLAG_LINE 1 /* only parse one line */ #define TRUE 1 #define FALSE 0 #define VCSV_ST_PRE 0 #define VCSV_ST_BEG 1 #define VCSV_ST_GET_BEG_DQUOT 2 #define VCSV_ST_GET_END_DQUOT 3 #define VCSV_ST_GET_NORM 4 #define VCSV_ST_SKIP_COMMA 5 #define VCSV_ST_SKIP_TRASH 6 #define VCSV_ST_SKIP_RET 7 #define VCSV_ST_INIT 8 #define VCSV__MEMCHR(x) memchr(iter->ptr, (x), iter->len) #define VCSV__INC(x) do { size_t local_inc_tmp = (x); \ assert(local_inc_tmp <= iter->len); \ \ iter->ptr += local_inc_tmp; \ iter->len -= local_inc_tmp; \ \ len -= local_inc_tmp; \ } while (FALSE) static Vstr_base *out = NULL; static Vstr_base *vcsv_data = NULL; static inline void vcsv_end(size_t pos, size_t data_len, size_t beg_len, size_t len, Vstr_sects *rows) { vstr_sects_add(rows, pos + (data_len - beg_len), beg_len - len); } static int vcsv_row_parse(Vstr_base *s1, size_t pos, size_t len, Vstr_sects *rows, unsigned int flags) { size_t data_len = len; size_t ret_len = len; size_t beg_len = 0; unsigned int state = VCSV_ST_INIT; const char *ptr = NULL; size_t tmp = 0; Vstr_iter iter[1]; if (!len) return (0); if (!vstr_iter_fwd_beg(s1, pos, len, iter)) abort(); while (len) { if (!iter->len) if (!vstr_iter_fwd_nxt(iter)) abort(); switch (state) { case VCSV_ST_SKIP_TRASH: if (!(ptr = VCSV__MEMCHR(','))) { VCSV__INC(iter->len); break; } tmp = ptr - iter->ptr; VCSV__INC(tmp); state = VCSV_ST_SKIP_COMMA; break; case VCSV_ST_SKIP_COMMA: assert(*iter->ptr == ','); VCSV__INC(1); state = VCSV_ST_PRE; beg_len = len; break; case VCSV_ST_PRE: if ((*iter->ptr == '\n') || (*iter->ptr == '\r')) { vcsv_end(pos, data_len, beg_len, len, rows); state = VCSV_ST_SKIP_RET; } else state = VCSV_ST_BEG; break; case VCSV_ST_SKIP_RET: if (flags & VCSV_FLAG_LINE) return (ret_len - len); case VCSV_ST_INIT: while ((*iter->ptr == '\n') || (*iter->ptr == '\r')) { /* skip blanks to start... */ VCSV__INC(1); if (!iter->len && !vstr_iter_fwd_nxt(iter)) return (ret_len); } beg_len = len; /* FALL THROUGH */ case VCSV_ST_BEG: if (*iter->ptr == '"') { state = VCSV_ST_GET_BEG_DQUOT; --beg_len; } else if (*iter->ptr == ',') { vcsv_end(pos, data_len, beg_len, len, rows); state = VCSV_ST_SKIP_COMMA; continue; } else { beg_len = len; /* mark start */ state = VCSV_ST_GET_NORM; } VCSV__INC(1); break; case VCSV_ST_GET_BEG_DQUOT: if (!(ptr = VCSV__MEMCHR('"'))) { VCSV__INC(iter->len); break; } tmp = ptr - iter->ptr; VCSV__INC(tmp + 1); state = VCSV_ST_GET_END_DQUOT; if (!len) { vcsv_end(pos, data_len, beg_len, 1, rows); return (ret_len); } break; case VCSV_ST_GET_END_DQUOT: { unsigned int found_ret = FALSE; ++len; /* go back to the '"' */ switch (*iter->ptr) { case '\r': case '\n': found_ret = TRUE; case ',': vcsv_end(pos, data_len, beg_len, len, rows); beg_len = 0; if (found_ret) state = VCSV_ST_SKIP_RET; else state = VCSV_ST_SKIP_COMMA; break; default: vcsv_end(pos, data_len, beg_len, len, rows); beg_len = 0; state = VCSV_ST_SKIP_TRASH; break; case '"': { size_t tpos = pos + (data_len - beg_len) + (beg_len - len); vstr_del(s1, tpos, 1); --data_len; /* update lengths and re-init iter */ --beg_len; len -= 2; /* for above */ if (!vstr_iter_fwd_beg(s1, tpos + 1, len, iter)) abort(); state = VCSV_ST_GET_BEG_DQUOT; continue; } break; } --len; /* reverse above */ } break; case VCSV_ST_GET_NORM: tmp = 0; while (tmp < iter->len) { if ((iter->ptr[tmp] == ',') || (iter->ptr[tmp] == '\r') || (iter->ptr[tmp] == '\n')) break; ++tmp; } VCSV__INC(tmp); if (!iter->len) break; vcsv_end(pos, data_len, beg_len, len, rows); if (iter->ptr[0] == ',') state = VCSV_ST_SKIP_COMMA; else state = VCSV_ST_SKIP_RET; break; default: abort(); } } if ((state != VCSV_ST_SKIP_RET) && (state != VCSV_ST_SKIP_TRASH)) vcsv_end(pos, data_len, beg_len, len, rows); return (ret_len); } #if USE_VDUMP # define VDUMP(ret, rows) vdump(ret, rows) static void vdump(unsigned int ret, Vstr_sects *rows) { unsigned int scan = 0; vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n" "%d\n" "${rep_chr:%c%zu}\n", '=', 79, ret, '=', 79); while (scan < rows->num) { size_t pos = 1; size_t len = 0; ++scan; len = VSTR_SECTS_NUM(rows, scan)->len; if (len) pos = VSTR_SECTS_NUM(rows, scan)->pos; vstr_add_fmt(out, out->len, "|${vstr:%p%zu%zu%u}|\n", vcsv_data, pos, len, 0); } vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n", '-', 79); while (out->len) if (!vstr_sc_write_fd(out, 1, out->len, 1, NULL)) abort(); } #else # define VDUMP(ret, rows) /* nothing */ #endif #if defined(_WIN32) #include #define MILLISECONDS_BETWEEN_1970_AND_1601 11644473600000Ui64 typedef unsigned __int64 uint64_t; uint64_t timestamp(void) { FILETIME ftime; uint64_t ret; GetSystemTimeAsFileTime(&ftime); ret = ftime.dwHighDateTime; ret <<= 32Ui64; ret |= ftime.dwLowDateTime; ret = ret / 10000Ui64 - MILLISECONDS_BETWEEN_1970_AND_1601; return ret; } #else #include #include static inline uint64_t timestamp(void) { struct timeval tval; gettimeofday(&tval, NULL); return tval.tv_sec * 1000LL + tval.tv_usec / 1000; } #endif int main(int argc, char *argv[]) { VSTR_SECTS_DECL(vrows, 256); int ret = 0; if (argc != 2) abort(); if (!vstr_init()) abort(); VSTR_SECTS_DECL_INIT(vrows); vcsv_data = vstr_make_base(NULL); out = vstr_make_base(NULL); vstr_cntl_conf(NULL, VSTR_CNTL_CONF_SET_FMT_CHAR_ESC, '$'); vstr_sc_fmt_add_all(NULL); do { size_t pos = 1; size_t len = 0; unsigned int err = 0; uint64_t t0 = timestamp(); if (!vstr_sc_mmap_file(vcsv_data, 0, argv[1], 0, 0, &err)) break; len = vcsv_data->len; if (err) abort(); while ((ret = vcsv_row_parse(vcsv_data, pos, len, vrows, VCSV_OPT_FLAGS))) { VDUMP(ret, vrows); pos += ret; len -= ret; vrows->num = 0; } vstr_add_fmt(out, out->len, "%'llu milliseconds\n", (timestamp() - t0)); while (out->len) if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL)) abort(); } while (FALSE); exit (EXIT_SUCCESS); }