Tutorial on ustr APIs

ustr is a string library designed to be very space efficient, and to integrate easily with "normal" C string handling code. This means that it is designed to allow the programmer to create the "strings" from allocated memory, the stack and read-only memory.

Also note that all error checking is included in every example. The examples might be somewhat easier to read if it wasn't included ... however, including error checking is what the code must look like in a real application.

Skel: Skeleton template for tutorial snippets.
Hello World - const: A very simple hello world, using constant strings.
Hello World - dup: Create the string.
Hello World - multiple: Create the string from multiple functions.
Hello World - stack: Use the stack instead of the heap.
fgrep: Simple fgrep.
mkdir: Simple mkdir -p.
Text to html converter: Convert plain text into similar html.

Skeleton code required to run the tutorial examples

This is the skeleton code required to run all the tutorial examples, the full versions may require a few more headers though.

#include "ustr.h"

#include <errno.h>

/* Print "<prog_name>: <msg>" followed by a newline on stderr, then end the
 * process with a failure exit status. This function does not return. */
static void die(const char *prog_name, const char *msg)
{
  (void)fprintf(stderr, "%s: %s\n", prog_name, msg);

  exit(EXIT_FAILURE);
}

Hello World - const

This example is the simplest possible, basically being the same as calling puts(). Note however that the length of the string is passed through to the IO function without any compiler magic by using the USTR1_CHK (checked constant ustr creation) macro (if it looks a little weird at first, you can use the custr example program to create constant ustr's automatically).

Note: Although the USTR1_CHK() macro is handed constant data and should be able to determine at compile time whether the length check is correct, GCC doesn't think this is constant and so for file scope variables you'll need to use the USTR1 (non-checking constant ustr creation) macro. Or even the simple USTR("") macro for empty ustr strings.

/* Simplest case: wrap a string literal in a constant Ustr and write it to
 * stdout as one line. The \xC handed to USTR1_CHK() is the length of the
 * literal (12). Exits through die() if the write fails. */
static void hello_world_one(void)
{
  Ustr *greeting = USTR1_CHK(\xC, "Hello world!");

  if (ustr_io_putfileline(&greeting, stdout))
    return;

  die("hello_world", strerror(errno));
}

Hello World - dup

This example creates the string before writing it out. Note that although the ustr is allocated, it doesn't need to be free'd because a default configured string of zero length is represented by "".

static void hello_world_two(void)
{ /* Build the Ustr from a printf() like format, which allocates storage,
   * then write it to stdout as one line.
   * NOTE(review): there is no ustr_free() here. That relies on the Ustr
   * being empty once it has been written out, as a default configured Ustr
   * of zero length is represented by "" -- confirm against the
   * ustr_io_putfileline() documentation. */
  Ustr *msg = ustr_dup_fmt("%s %s", "Hello", "world!");

  if (!msg) /* creation failed */
    die("hello_world", strerror(errno));

  if (!ustr_io_putfileline(&msg, stdout))
    die("hello_world", strerror(errno));
}

Hello World - multiple parts

This example creates the string using multiple functions. The interesting part to notice is the call to ustr_enomem(), this checks if any allocation failures have happened to this particular ustr. Also note that we create a specifically configured ustr, which includes a size and a 4 byte reference count.

static void hello_world_three(void)
{ /* Assemble the output by hand from three constant pieces. This is often
   * "significantly faster" than ustr_*_fmt() because there is no printf
   * overhead, but memory is still allocated (and maybe resized), so the
   * result has to be free'd at the end. */
  Ustr *out = ustr_dupx_empty(1, 4, USTR_FALSE, USTR_FALSE);

  if (!out)
    die("hello_world", strerror(ENOMEM));

  /* The return values are deliberately not looked at one by one ... */
  ustr_add(&out, USTR1(\5, "Hello"));
  ustr_add(&out, USTR1(\1, " "));
  ustr_add(&out, USTR1(\6, "world!"));

  /* ... because a single ustr_enomem() covers all 3 additions at once. */
  if (ustr_enomem(out))
    die("hello_world", strerror(ENOMEM));

  if (!ustr_io_putfileline(&out, stdout))
    die("hello_world", strerror(errno));

  ustr_free(out);
}

Hello World - stack based allocation

This example also creates a string from multiple functions, however the storage for the ustr is on the stack and so we don't need to de-allocate the ustr (although, as with the constant ustr, it does no harm).

static void hello_world_four(void)
{ /* Build the Ustr by hand again, but keep it in "auto" allocated storage
   * (the stack rather than the heap), pre-loaded with "Hello". As long as
   * you don't use more than ustr_size() you don't need to free. Also note
   * that ustr_dup() will now always copy. */
  char buf_out[1024] = USTR_BEG_FIXED2 "Hello";
  Ustr *out = USTR_SC_INIT_AUTO(buf_out, USTR_TRUE, 5);

  ustr_add(&out, USTR1(\1, " "));
  ustr_add(&out, USTR1(\6, "world!"));

  /* In this case we know !ustr_enomem(), as the buffer has more than enough
   * space for the two additions. */

  /* ustr_free() is not needed on the success path, because nothing was
   * allocated. Although it's often good to call it anyway, as it does no
   * harm. */
  if (ustr_io_putfileline(&out, stdout))
    return;

  die("hello_world", strerror(errno));
}

fgrep

This example works like GNU's fgrep --color. There are two main things to notice here. The first is that searching, or searching and replacing with a colourized variant, and reading lines are all just ustr API calls. The second is that the line data is allocated on the stack, by default.

Both of these significantly improve the ease of use and speed of the resulting code, as ustr_io_getline() is significantly faster than fgets() (because fgets() does not return the length of the data it read) and allocating from the stack can be a huge performance win (note that all functions, including ustr_free(), do the correct thing ... so we don't suffer complexity for that performance).

/* Filter, and optionally colourize, a single line for fgrep.
 *
 *  ps1        - in/out: the line to examine; it is emptied when nothing
 *               matched.
 *  fgrep_srch - the string to look for.
 *  fgrep_repl - replacement for each match, or NULL to only search.
 *  first_only - non-zero to replace just the first match, zero to replace
 *               every match.
 *
 * Returns 1 if the line contained fgrep_srch, otherwise 0. Calls die() if
 * the replacement fails, which is signalled by ustr_replace() returning 0
 * with errno set. */
static int fgrep(Ustr **ps1, Ustr *fgrep_srch, Ustr *fgrep_repl,
                 int first_only)
{
  size_t num = 0;

  if (!fgrep_repl) /* nothing to substitute, so a plain search is enough */
    num = ustr_srch_fwd(*ps1, 0, fgrep_srch);
  else
  {
    /* NOTE(review): assumes a limit of 0 means "replace every match" and
     * that a limit of 1 stops after the first -- confirm against the ustr
     * API documentation for ustr_replace(). */
    size_t lim = first_only ? 1 : 0;

    errno = 0; /* a stale errno must not turn "no match" into a failure */
    num = ustr_replace(ps1, fgrep_srch, fgrep_repl, lim);
    if (!num && errno)
      die("fgrep", strerror(ENOMEM));
  }

  if (!num) /* no match: hand back an empty line */
    ustr_sc_del(ps1);

  return (!!num);
}

/* Read the stream "in" line by line, pass each line through fgrep() and
 * write the lines that matched to stdout. Calls die() when writing fails,
 * or when errno is set once reading has stopped. */
static void fgrep_fp_loop(FILE *in,
                          Ustr *fgrep_srch, Ustr *fgrep_repl)
{
  /* Enough for two "normal" lines, after that we alloc. */
  char stack_buf[USTR_SIZE_FIXED(160)];
  Ustr *line = USTR_SC_INIT_AUTO(stack_buf, USTR_FALSE, 0);

  for (;;)
  {
    if (!ustr_io_getline(&line, in))
      break;

    if (fgrep(&line, fgrep_srch, fgrep_repl, USTR_FALSE))
    {
      if (!ustr_io_putfile(&line, stdout))
        die("fgrep", strerror(errno));
    }
    else
      ustr_sc_del(&line);

    /* The line no longer lives in the stack buffer, so release it and
     * re-init from the stack buffer for the next read. */
    if (line != USTR(stack_buf))
      ustr_sc_free2(&line, USTR_SC_INIT_AUTO(stack_buf, USTR_FALSE, 0));
  }

  /* NOTE(review): errno is not cleared before the loop, so this may also
   * see a value left behind by an earlier, unrelated call -- confirm. */
  if (errno)
    die("fgrep", strerror(errno));

  ustr_free(line);
}

mkdir -p function

This example shows how a single function can take both an allocated (and modifiable) ustr and a constant string ustr, to efficiently modify data.

/* Recursive worker for mkdir_p(): create the directory named by s1 and,
 * when that fails with ENOENT, first create its parent directories.
 *
 *  s1   - the path; it may be a constant Ustr, in which case a private
 *         copy is made before the path is edited in place.
 *  mode - permissions handed to mkdir().
 *  off  - offset for the reverse search for the previous '/' (0 on the
 *         first call; recursive calls pass ustr_len(s1) minus the index of
 *         the separator that was just cut).
 *  ret  - number of directories counted so far.
 *
 * Returns -1 on error, with errno describing the failure, or the number of
 * directories created. The path in s1 is intact again on return, both on
 * success and on failure. */
static int fu__mkdir_p(const Ustr *s1, int mode, size_t off, int ret)
{
  Ustr *allocd = USTR_NULL;
  char *ptr = NULL;
  int saved_errno = 0;

  if (mkdir(ustr_cstr(s1), mode) != -1)
    return (ret + 1);

  switch (errno)
  {
    case EEXIST: return (ret);
      
    case ENOENT: break; /* a parent is missing, create that first */
    
    default:     return (-1);
  }

  /* No separator left (0), or only one at the very start of the path (1),
   * means there is no parent to create. */
  if ((off = ustr_srch_chr_rev(s1, off, '/')) <= 1)
  {
    errno = EINVAL;
    return (-1);
  }
  --off; /* NOTE: offset moves from beg. to end */
  
  if (!ustr_owner(s1))
  { /* do it this way, so we can pass constant Ustr's to this function
     * and don't use ustr_sc_ensure_owner() so that we don't release a
     * reference */
    if (!(allocd = ustr_dup_buf(ustr_cstr(s1), ustr_len(s1))))
      return (-1); /* errno == ENOMEM, done by ustr */
    s1 = allocd;
  }
  
  /* Temporarily cut the path at the separator, so the C string seen by
   * mkdir() in the recursive call is the parent directory. */
  ptr = ustr_wstr((Ustr *)s1);
  ptr[off] = 0;
  ret = fu__mkdir_p(s1, mode, ustr_len(s1) - off, ret + 1);

  /* Always put the separator back, on failure too, so a Ustr owned by the
   * caller is never left truncated. */
  ptr[off] = '/';

  if ((ret != -1) && (mkdir(ustr_cstr(s1), mode) == -1))
    ret = -1;

  saved_errno = errno; /* don't let the free disturb the reported error */
  ustr_free(allocd);
  errno = saved_errno;

  return (ret);
}

/* Create the directory named by s1 with permissions mode, by handing the
 * work to fu__mkdir_p() with a search offset of 0 and a count of 0.
 * This returns -1, on error, or the number of directories created. */
static int mkdir_p(const Ustr *s1, int mode)
{
  const size_t start_off = 0;
  const int created_so_far = 0;

  return (fu__mkdir_p(s1, mode, start_off, created_so_far));
}

In other words:

 mkdir_p(USTR1(\x9, "12/45/789"), 0700)

...will create/free a single ustr for all 3 of the separate paths required.

text into similar looking html converter

This is a function showing how you can do the equivalent of the following perl code (written by tchrist@perl.com - Sunday, December 19th, 1999):

/* 
# first kill all the tabs
1 while s{ \t + }
         { " " x (length($&)*8 - length($`)%8) }ex;

# then the four standard naughty bits 
s/&/&amp;/g;        # must remember to do this one first!
s/</&lt;/g;         # this is the most important one
s/>/&gt;/g;         # don't close too early
s/"/&quot;/g;       # only in embedded tags, i guess

# make lines break where they should
s/^\s*$/<P>/ || s/$/<BR>/;

# make sure spaces aren't squishticated so we
# can do indentation and properly align comments
s/( {2,})/'&nbsp;' x length($1)/ge;
*/

The interesting points here are how simple most of the ustr code is, tab conversion is 2 lines in the original perl and 5 (1 could be dropped for a speed loss) with ustr. The "four std. naughty bits" take a single line in both perl and ustr. The only major difference is in the final perl 1 liner, which is 8 interesting lines (due to the two or more spaces constraint, or we could just use ustr_replace()).

It's also worth noting that the ustr version is significantly faster than the smaller perl code.

/* Convert one line of plain text into similar looking HTML.
 *
 *  pline - in/out: pointer to the line; on return it points at the
 *          converted line.
 *
 * Steps, in order: runs of tabs become spaces up to the next multiple of 8
 * columns; a line holding only spaces (ignoring a trailing newline) becomes
 * "<P>"; any other line gets & < > " escaped, its trailing newline replaced
 * by "<BR>\n", and every run of two or more spaces turned into the same
 * number of "&nbsp;". Calls die() on allocation failure. */
static void txt2html(Ustr **pline)
{
  Ustr *line = *pline;
  size_t tab_pos = 0;
  size_t tab_off = 0;
  int has_ret = 0;

  /* convert tabs to spaces */
  while ((tab_pos = ustr_srch_chr_fwd(line, tab_off, '\t')))
  {
    size_t tabs_len = ustr_spn_chr_fwd(line, tab_pos - 1, '\t');
    size_t spcs_len = (tabs_len * 8) - ((tab_pos - 1) % 8);

    ustr_sc_sub_rep_chr(&line, tab_pos, tabs_len, ' ', spcs_len);

    /* carry on searching after the spaces that were just inserted */
    tab_off = tab_pos + spcs_len - 1;
  }

  /* An empty line has no last byte to look at, so check the length first. */
  if (ustr_len(line) && (ustr_cstr(line)[ustr_len(line) - 1] == '\n'))
    has_ret = 1;

  if (ustr_spn_chr_fwd(line, 0, ' ') == (ustr_len(line) - has_ret))
    ustr_set(&line, USTR1(\3, "<P>")); /* blank lines start new paragraph */
  else
  {
    size_t spcs_off = 0;
    size_t spcs_pos = 0;
    char buf_rep[USTR_SIZE_FIXED(40 * 6)] = USTR_BEG_FIXED2 "&nbsp;";

    /* "&" has to be escaped first, as the other escapes introduce it */
    ustr_replace_cstr(&line, "&",  "&amp;",  0);
    ustr_replace_cstr(&line, "<",  "&lt;",   0);
    ustr_replace_cstr(&line, ">",  "&gt;",   0);
    ustr_replace_cstr(&line, "\"", "&quot;", 0);
    ustr_del(&line, has_ret); /* drop the newline, if there was one */
    ustr_add_cstr(&line, "<BR>\n");

    /* convert runs of two or more spaces into runs of &nbsp; */
    while ((spcs_pos = ustr_srch_cstr_fwd(line, spcs_off, "  ")))
    {
      size_t spcs_len = ustr_spn_chr_fwd(line, spcs_pos - 1, ' ');
      size_t rep = spcs_len;
      Ustr *rep_nbsp = USTR_SC_INIT_AUTO(buf_rep, USTR_FALSE, 0);

      /* one "&nbsp;" for each space in the run */
      while (rep--)
        ustr_add_cstr(&rep_nbsp, "&nbsp;");

      if (ustr_enomem(rep_nbsp) ||
          !ustr_sc_sub(&line, spcs_pos, spcs_len, rep_nbsp))
        die("txt2html", strerror(ENOMEM));

      /* carry on searching after the text that was just inserted */
      spcs_off = spcs_pos + (spcs_len * strlen("&nbsp;")) - 1;

      ustr_free(rep_nbsp);
    }
  }

  /* one check for all of the unchecked calls on line above */
  if (ustr_enomem(line))
    die("txt2html", strerror(ENOMEM));

  *pline = line;
}

James Antill

Last modified: Tue Oct 30 01:20:07 EDT 2007