/* ex_csv.c */
/* Flags that main() passes to vcsv_row_parse(). */
#define VCSV_OPT_FLAGS VCSV_FLAG_LINE
/* Non-zero compiles in vdump(), so VDUMP() prints every parsed row. */
#define USE_VDUMP 1
/* Zero causes NDEBUG to be defined below, which turns assert() into a no-op. */
#define USE_DEBUG 0
#if !(USE_DEBUG)
# define NDEBUG 1
#endif
/* Must be defined before <vstr.h> is included.
 * NOTE(review): presumably tells vstr.h to pull in the system headers it
 * needs -- confirm against the Vstr documentation. */
#define VSTR_COMPILE_INCLUDE 1
#include <vstr.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#define VCSV_FLAG_NONE 0
#define VCSV_FLAG_LINE 1
#define TRUE 1
#define FALSE 0
#define VCSV_ST_PRE 0
#define VCSV_ST_BEG 1
#define VCSV_ST_GET_BEG_DQUOT 2
#define VCSV_ST_GET_END_DQUOT 3
#define VCSV_ST_GET_NORM 4
#define VCSV_ST_SKIP_COMMA 5
#define VCSV_ST_SKIP_TRASH 6
#define VCSV_ST_SKIP_RET 7
#define VCSV_ST_INIT 8
/*
 * Both helpers expand inside vcsv_row_parse() and rely on two names from the
 * expansion site: "iter" (an object with ->ptr and ->len describing the
 * current contiguous piece of data) and "len" (bytes still to be parsed).
 */

/* Find byte (x) within the current piece; yields NULL when it is absent. */
#define VCSV__MEMCHR(x) memchr(iter->ptr, (x), iter->len)

/*
 * Consume (x) bytes: move the piece pointer forward and shrink both the piece
 * length and the overall remaining length.  (x) is evaluated exactly once and
 * must not exceed the bytes left in the current piece.
 */
#define VCSV__INC(x)                       \
  do {                                     \
    size_t vcsv_step = (x);                \
                                           \
    assert(vcsv_step <= iter->len);        \
    iter->ptr += vcsv_step;                \
    iter->len -= vcsv_step;                \
    len -= vcsv_step;                      \
  } while (FALSE)
/* Output buffer: vdump() and main() format text into it and then drain it
 * with vstr_sc_write_fd(). */
static Vstr_base *out = NULL;
/* The CSV data being parsed; main() fills it through vstr_sc_mmap_file(). */
static Vstr_base *vcsv_data = NULL;
/*
 * Record one finished field as a section in "rows".
 *
 *   pos      - position in the string where the parsed region starts
 *   data_len - current length of the parsed region
 *   beg_len  - bytes that were still unparsed when the field began
 *   len      - bytes still unparsed where the field ends
 *
 * The field therefore starts (data_len - beg_len) bytes into the region and
 * is (beg_len - len) bytes long.  The result of vstr_sects_add() is ignored.
 */
static inline void vcsv_end(size_t pos, size_t data_len,
                            size_t beg_len, size_t len,
                            Vstr_sects *rows)
{
  const size_t field_pos = pos + (data_len - beg_len);
  const size_t field_len = beg_len - len;

  vstr_sects_add(rows, field_pos, field_len);
}
/*
 * Parse CSV fields out of the "len" bytes of "s1" that start at position
 * "pos", adding one section to "rows" (through vcsv_end()) per field.
 *
 * Fields are separated by ','; a line ends at '\r' or '\n'.  A field that
 * begins with '"' runs to the closing '"'; inside it a doubled quote ("")
 * stands for one quote character, and one of the two bytes is deleted from
 * "s1" with vstr_del(), so this function MODIFIES the string.
 *
 * flags: when VCSV_FLAG_LINE is set the function returns as soon as it is
 *        positioned on a line terminator; otherwise it keeps going, skipping
 *        terminators, until the data is used up.
 *
 * Returns 0 when "len" is 0.  Otherwise returns the number of bytes consumed,
 * counted against the length passed in: either (ret_len - len) at a line
 * terminator (the terminator itself is not consumed) or the whole original
 * length.  Bytes deleted from "s1" are still included in that count, so a
 * caller that advances "pos" by the return value has to allow for the string
 * having become shorter.
 *
 * Calls abort() when the iterator cannot be started or moved forward, or
 * when an unknown state is reached.
 */
static int
vcsv_row_parse(Vstr_base *s1, size_t pos, size_t len,
Vstr_sects *rows, unsigned int flags)
{
/* Current length of the region being parsed; decremented on every vstr_del(). */
size_t data_len = len;
/* Length on entry; never changed, used only to compute the return value. */
size_t ret_len = len;
/* Value "len" had at the start of the field currently being read. */
size_t beg_len = 0;
unsigned int state = VCSV_ST_INIT;
const char *ptr = NULL;
size_t tmp = 0;
Vstr_iter iter[1];
if (!len)
return (0);
if (!vstr_iter_fwd_beg(s1, pos, len, iter))
abort();
/* "len" is the number of bytes left to parse; VCSV__INC() decrements it. */
while (len)
{
/* The current piece is used up: move the iterator on to the next one. */
if (!iter->len)
if (!vstr_iter_fwd_nxt(iter)) abort();
switch (state)
{
/* Unexpected data followed a closing quote: discard up to the next comma.
 * NOTE(review): only ',' is searched for, so a line terminator inside the
 * discarded data is skipped as well -- confirm that this is intended. */
case VCSV_ST_SKIP_TRASH:
if (!(ptr = VCSV__MEMCHR(',')))
{
/* No comma in this piece: drop all of it and look in the next one. */
VCSV__INC(iter->len);
break;
}
tmp = ptr - iter->ptr;
VCSV__INC(tmp);
state = VCSV_ST_SKIP_COMMA;
break;
/* Consume the comma; the next field starts right after it. */
case VCSV_ST_SKIP_COMMA:
assert(*iter->ptr == ',');
VCSV__INC(1);
state = VCSV_ST_PRE;
beg_len = len;
break;
/* First byte after a comma; nothing is consumed in this state. */
case VCSV_ST_PRE:
if ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
{
/* A comma directly before the end of line: record an empty last field
 * (beg_len == len here). */
vcsv_end(pos, data_len, beg_len, len, rows);
state = VCSV_ST_SKIP_RET;
}
else
state = VCSV_ST_BEG;
break;
/* Positioned on a line terminator. */
case VCSV_ST_SKIP_RET:
if (flags & VCSV_FLAG_LINE)
/* Line mode: report the bytes consumed so far; the terminator is left for
 * the next call, whose VCSV_ST_INIT state skips it. */
return (ret_len - len);
/* Not in line mode: deliberately fall through and skip the terminator(s). */
case VCSV_ST_INIT:
while ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
{
VCSV__INC(1);
/* Data ended while skipping terminators: everything has been consumed. */
if (!iter->len && !vstr_iter_fwd_nxt(iter))
return (ret_len);
}
beg_len = len;
/* Deliberately fall through: the first field starts at this byte. */
case VCSV_ST_BEG:
if (*iter->ptr == '"')
{
state = VCSV_ST_GET_BEG_DQUOT;
/* Together with the VCSV__INC(1) below this leaves beg_len == len, i.e. the
 * field is counted from just after the opening quote. */
--beg_len;
}
else if (*iter->ptr == ',')
{
/* Empty field: record it and let VCSV_ST_SKIP_COMMA consume the comma. */
vcsv_end(pos, data_len, beg_len, len, rows);
state = VCSV_ST_SKIP_COMMA;
continue;
}
else
{
/* Plain field: it starts at the byte consumed just below. */
beg_len = len;
state = VCSV_ST_GET_NORM;
}
VCSV__INC(1);
break;
/* Inside a quoted field: look for the next double quote. */
case VCSV_ST_GET_BEG_DQUOT:
if (!(ptr = VCSV__MEMCHR('"')))
{
/* No quote in this piece: consume it all and keep looking. */
VCSV__INC(iter->len);
break;
}
tmp = ptr - iter->ptr;
/* Consume up to and including the quote. */
VCSV__INC(tmp + 1);
state = VCSV_ST_GET_END_DQUOT;
if (!len)
{
/* The quote was the last byte of the data: record the field, passing 1 as
 * the remaining length so the closing quote is left out of it. */
vcsv_end(pos, data_len, beg_len, 1, rows);
return (ret_len);
}
break;
/* The byte after a quote decides between end of field and a "" escape. */
case VCSV_ST_GET_END_DQUOT:
{
unsigned int found_ret = FALSE;
/* Temporarily step back over the quote just consumed so the vcsv_end() calls
 * below exclude it; undone by the --len after the inner switch. */
++len;
switch (*iter->ptr)
{
case '\r':
case '\n':
found_ret = TRUE;
/* Deliberately fall through: a terminator ends the field like a comma. */
case ',':
vcsv_end(pos, data_len, beg_len, len, rows);
/* Reset; assigned again before the next field is recorded. */
beg_len = 0;
if (found_ret)
state = VCSV_ST_SKIP_RET;
else
state = VCSV_ST_SKIP_COMMA;
break;
default:
/* Something else follows the closing quote: keep the quoted part as the
 * field and discard the rest up to the next comma. */
vcsv_end(pos, data_len, beg_len, len, rows);
beg_len = 0;
state = VCSV_ST_SKIP_TRASH;
break;
case '"':
{
/* Doubled quote.  With len stepped back, tpos is the position of the first
 * of the two quotes (it reduces to pos + data_len - len). */
size_t tpos = pos + (data_len - beg_len) + (beg_len - len);
/* Delete the first quote; the second stays as a literal '"' in the field. */
vstr_del(s1, tpos, 1);
/* One byte fewer in the region; field start offset (data_len - beg_len) is
 * kept unchanged by decrementing both. */
--data_len;
--beg_len;
/* Undo the ++len above and step over the quote that was kept. */
len -= 2;
/* The string changed, so restart the iterator just past the kept quote.
 * NOTE(review): len can be 0 here when the data ends with "" -- verify that
 * vstr_iter_fwd_beg() accepts a zero length, otherwise this aborts. */
if (!vstr_iter_fwd_beg(s1, tpos + 1, len, iter)) abort();
state = VCSV_ST_GET_BEG_DQUOT;
continue;
}
/* Not reached: the block above always leaves through "continue". */
break;
}
--len;
}
break;
/* Inside an unquoted field: scan for the first ',', '\r' or '\n'. */
case VCSV_ST_GET_NORM:
tmp = 0;
while (tmp < iter->len)
{
if ((iter->ptr[tmp] == ',') ||
(iter->ptr[tmp] == '\r') ||
(iter->ptr[tmp] == '\n'))
break;
++tmp;
}
VCSV__INC(tmp);
/* No delimiter in this piece: the field continues in the next one. */
if (!iter->len)
break;
/* Delimiter found (not consumed): the field ends just before it. */
vcsv_end(pos, data_len, beg_len, len, rows);
if (iter->ptr[0] == ',')
state = VCSV_ST_SKIP_COMMA;
else
state = VCSV_ST_SKIP_RET;
break;
default:
abort();
}
}
/* The data ran out.  Unless the last field has already been recorded (state
 * SKIP_RET) or trailing junk was being discarded (SKIP_TRASH), record the
 * field that was still open. */
if ((state != VCSV_ST_SKIP_RET) && (state != VCSV_ST_SKIP_TRASH))
vcsv_end(pos, data_len, beg_len, len, rows);
return (ret_len);
}
#if USE_VDUMP
# define VDUMP(ret, rows) vdump(ret, rows)
/*
 * Format one parsed row into "out" and write it to file descriptor 1.
 *
 * Output layout: a rule of '=' characters, the value of "ret", another '='
 * rule, then one "|field|" line for each section in "rows" (the text is taken
 * from vcsv_data), and finally a rule of '-' characters.
 *
 *   ret  - value returned by vcsv_row_parse() for this row
 *   rows - sections describing the fields of the row
 *
 * Calls abort() when writing fails.
 */
static void vdump(unsigned int ret, Vstr_sects *rows)
{
  /* The %zu conversions fetch a size_t from the argument list, so the rule
   * width has to be passed as a size_t; a bare int constant would not match
   * the conversion. */
  const size_t rule_width = 79;
  unsigned int scan = 0;

  /* "ret" is unsigned, hence %u. */
  vstr_add_fmt(out, out->len,
               "${rep_chr:%c%zu}\n" "%u\n" "${rep_chr:%c%zu}\n",
               '=', rule_width, ret, '=', rule_width);

  /* Sections are numbered from 1. */
  while (scan < rows->num)
  {
    size_t pos = 1;
    size_t len = 0;

    ++scan;
    len = VSTR_SECTS_NUM(rows, scan)->len;
    /* For an empty field keep pos at 1 rather than using the stored position. */
    if (len)
      pos = VSTR_SECTS_NUM(rows, scan)->pos;

    vstr_add_fmt(out, out->len, "|${vstr:%p%zu%zu%u}|\n",
                 (void *)vcsv_data, pos, len, 0U);
  }

  vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n", '-', rule_width);

  /* Keep writing until the output buffer is empty. */
  while (out->len)
    if (!vstr_sc_write_fd(out, 1, out->len, 1, NULL))
      abort();
}
#else
# define VDUMP(ret, rows)
#endif
#if defined(_WIN32)
#include <Windows.h>
#define MILLISECONDS_BETWEEN_1970_AND_1601 11644473600000Ui64
typedef unsigned __int64 uint64_t;
/*
 * Windows variant: current system time in milliseconds, rebased from the
 * FILETIME epoch by subtracting MILLISECONDS_BETWEEN_1970_AND_1601.
 */
uint64_t
timestamp(void)
{
  FILETIME now;
  uint64_t ticks;

  GetSystemTimeAsFileTime(&now);

  /* Join the two 32-bit halves into one 64-bit tick count. */
  ticks = now.dwHighDateTime;
  ticks = (ticks << 32) | now.dwLowDateTime;

  /* Scale down by 10000, then shift the epoch. */
  return (ticks / 10000Ui64) - MILLISECONDS_BETWEEN_1970_AND_1601;
}
#else
#include <sys/time.h>
#include <inttypes.h>
/*
 * Current wall-clock time from gettimeofday(), in milliseconds.
 * The return value of gettimeofday() is not checked.
 */
static inline uint64_t
timestamp(void)
{
  struct timeval now;
  long long millis;

  gettimeofday(&now, NULL);

  millis = now.tv_sec * 1000LL;   /* whole seconds -> milliseconds */
  millis += now.tv_usec / 1000;   /* microseconds  -> milliseconds */
  return millis;
}
#endif
/*
 * Entry point: map the CSV file named by argv[1] into vcsv_data, parse it
 * with vcsv_row_parse() until that returns 0 (dumping each result through
 * VDUMP), then write the elapsed time in milliseconds to file descriptor 2.
 *
 * Calls abort() on a wrong argument count, on library initialisation or
 * allocation failure, and on write failure.
 */
int main(int argc, char *argv[])
{
  VSTR_SECTS_DECL(vrows, 256);
  int ret = 0;

  if (argc != 2)
    abort();

  if (!vstr_init())
    abort();

  VSTR_SECTS_DECL_INIT(vrows);

  vcsv_data = vstr_make_base(NULL);
  out = vstr_make_base(NULL);
  /* Both strings are dereferenced below; fail the same way as every other
   * setup error in this file instead of crashing on a NULL pointer. */
  if (!vcsv_data || !out)
    abort();

  /* Make '$' the escape character for the ${...} custom formatters. */
  vstr_cntl_conf(NULL, VSTR_CNTL_CONF_SET_FMT_CHAR_ESC, '$');
  vstr_sc_fmt_add_all(NULL);

  do
  {
    size_t pos = 1;
    size_t len = 0;
    unsigned int err = 0;
    uint64_t t0 = timestamp();

    /* NOTE(review): when the file cannot be mapped the program still exits
     * with EXIT_SUCCESS and prints nothing -- confirm this is intended. */
    if (!vstr_sc_mmap_file(vcsv_data, 0, argv[1], 0, 0, &err))
      break;
    len = vcsv_data->len;
    if (err)
      abort();

    for (;;)
    {
      size_t before = vcsv_data->len;
      size_t shrunk = 0;

      ret = vcsv_row_parse(vcsv_data, pos, len, vrows, VCSV_OPT_FLAGS);
      if (!ret)
        break;

      VDUMP(ret, vrows);

      /* vcsv_row_parse() deletes one byte from the string for every doubled
       * quote it unescapes, yet its return value counts consumed bytes
       * against the length it was given.  Those deleted bytes are all inside
       * the consumed part, so the next position is short by that amount while
       * the number of bytes still to parse is unaffected. */
      shrunk = before - vcsv_data->len;
      pos += (size_t)ret - shrunk;
      len -= (size_t)ret;
      vrows->num = 0;
    }

    /* uint64_t need not be unsigned long long; cast so the argument matches
     * the %llu conversion. */
    vstr_add_fmt(out, out->len, "%'llu milliseconds\n",
                 (unsigned long long)(timestamp() - t0));

    /* Keep writing until the output buffer is empty. */
    while (out->len)
      if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL))
        abort();
  } while (FALSE);

  exit (EXIT_SUCCESS);
}