2
\$\begingroup\$

The library (inspired by stb libraries) attempts to provide some commonly used functions (reading a file into memory, determining the size of a file) that are missing from the C standard library portably (for my own use cases).

Code:

#ifndef IO_H
#define IO_H

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* 
 * To use, do this:
 *   #define IO_IMPLEMENTATION
 * before you include this file in *one* C file to create the implementation.
 *
 * i.e. it should look like:
 * #include ...
 * #include ...
 *
 * #define IO_IMPLEMENTATION
 * #include "io.h"
 * ...
 * 
 * To make all functions have internal linkage, i.e. be private to the source
 * file, do this:
 *  #define `IO_STATIC` 
 * before including "io.h".
 *
 * i.e. it should look like:
 * #define IO_IMPLEMENTATION
 * #define IO_STATIC
 * #include "io.h"
 * ...
 *
 * You can #define IO_MALLOC, IO_REALLOC, and IO_FREE to avoid using malloc(),
 * realloc(), and free(). Note that all three must be defined at once, or none.
 */

#ifndef IO_DEF
#ifdef IO_STATIC
#define IO_DEF  static
#else
#define IO_DEF  extern
#endif                          /* IO_STATIC */
#endif                          /* IO_DEF */

#if defined(__GNUC__) || defined(__clang__)
#define ATTRIB_NONNULL(...)              __attribute__((nonnull (__VA_ARGS__)))
#define ATTRIB_WARN_UNUSED_RESULT        __attribute__((warn_unused_result))
#else
#define ATTRIB_NONNULL(...)              /* If only. */
#define ATTRIB_WARN_UNUSED_RESULT        /* If only. */
#endif                          /* defined(__GNUC__) || define(__clang__) */

/* 
 * Reads the file pointed to by `stream` to a buffer and returns it.
 * The returned buffer is a null-terminated string.
 * If `nbytes` is not NULL, it shall hold the size of the file.
 * 
 * Returns NULL on memory allocation failure. The caller is responsible for
 * freeing the returned pointer.
 */
IO_DEF char *io_read_file(FILE *stream,
    size_t *nbytes) ATTRIB_NONNULL(1) ATTRIB_WARN_UNUSED_RESULT;

/* 
 * Splits a string into a sequence of tokens. The `delim` argument 
 * specifies a set of bytes that delimit the tokens in the parsed string.
 * If `ntokens` is not NULL, it shall hold the amount of total tokens.
 *
 * Returns an array of pointers to the tokens, or NULL on memory allocation
 * failure. The caller is responsible for freeing the returned pointer.
 */
IO_DEF char **io_split_by_delim(char *s, const char *delim,
    size_t *ntokens) ATTRIB_NONNULL(1, 2) ATTRIB_WARN_UNUSED_RESULT;

/* 
 * Splits a string into lines.
 * A wrapper around `io_split_by_delim()`. It calls the function with "\n" as
 * the delimiter.
 *
 * Returns an array of pointers to the tokens, or NULL on memory allocation
 * failure. The caller is responsible for freeing the returned pointer.
 */
IO_DEF char **io_split_lines(char *s,
    size_t *nlines) ATTRIB_NONNULL(1) ATTRIB_WARN_UNUSED_RESULT;

/* 
 * Reads the next line from the stream pointed to by `stream`. The returned line 
 * is null-terminated and does not contain a newline, if one was found.
 *
 * The memory pointed to by `size` shall contain the length of the 
 * line (including the terminating null character). Else it shall contain 0.
 *  
 * Upon successful completion a pointer is returned and the size of the line is 
 * stored in the memory pointed to by `size`, otherwise NULL is returned and
 * `size` holds 0.
 * 
 * `fgetline()` does not distinguish between end-of-file and error; the routines
 * `feof()` and `ferror()` must be used to determine which occurred. The
 * function also returns NULL on a memory-allocation failure. 
 *
 * Although a null character is always supplied after the line, note that
 * `strlen(line)` will always be smaller than the value is `size` if the line
 * contains embedded null characters.
 */
IO_DEF char *io_read_line(FILE *stream, size_t *size) ATTRIB_NONNULL(1,
    2) ATTRIB_WARN_UNUSED_RESULT;

/*
 * `size` must be a non-null pointer. On success, the function assigns `size`
 * with the number of bytes read and returns true, or returns false elsewise.
 *
 * Note: The file can grow between io_fsize() and a subsequent read.
 */
IO_DEF bool io_fsize(FILE *stream, size_t *size) ATTRIB_NONNULL(1, 2);

/* 
 * Writes `lines` to the file pointed to by `stream`.
 *
 * On success, it returns true, or false elsewise.
 */
IO_DEF bool io_write_lines(FILE *stream, size_t nlines,
    char *lines[const static nlines]) ATTRIB_NONNULL(1, 3);

/* 
 * Writes nbytes from the buffer pointed to by `data` to the file pointed to 
 * by `stream`. 
 *
 * On success, it returns true, or false elsewise.
 */
IO_DEF bool io_write_file(FILE *stream, size_t nbytes,
    const char data[static nbytes]) ATTRIB_NONNULL(1, 3);

#endif                          /* IO_H */

#ifdef IO_IMPLEMENTATION

#if defined(IO_MALLOC) && defined(IO_REALLOC) && defined(IO_FREE)
// Ok.
#elif !defined(IO_MALLOC) && !defined(IO_REALLOC) && !defined(IO_FREE)
// Ok.
#else
#error  "Must define all or none of IO_MALLOC, IO_REALLOC, and IO_FREE."
#endif

#ifndef IO_MALLOC
#define IO_MALLOC(sz)       malloc(sz)
#define IO_REALLOC(p, sz)   realloc(p, sz)
#define IO_FREE(p)          free(p)
#endif

#include <stdlib.h>
#include <string.h>
#include <string.h>

#define CHUNK_SIZE          INT64_C(1024 * 8)
#define TOKEN_CHUNK_SIZE    INT64_C(1024 * 2)

IO_DEF char *io_read_file(FILE *stream, size_t *nbytes)
{
    static const size_t page_size = CHUNK_SIZE;
    char *content = NULL;
    size_t len = 0;

    for (size_t rcount = 1; rcount; len += rcount) {
        void *const tmp = IO_REALLOC(content, len + page_size);

        if (!tmp) {
            IO_FREE(content);
            return content = NULL;
        }
        content = tmp;
        rcount = fread(content + len, 1, page_size - 1, stream);

        if (ferror(stream)) {
            IO_FREE(content);
            return content = NULL;
        }
    }

    if (nbytes) {
        *nbytes = len;
    }
    content[len] = '\0';
    return content;
}

IO_DEF char **io_split_by_delim(char *s, const char *delim, size_t *ntokens)
{
    char **tokens = NULL;
    const size_t chunk_size = TOKEN_CHUNK_SIZE;
    size_t capacity = 0;
    size_t token_count = 0;

    while (*s) {
        if (token_count >= capacity) {
            char **const tmp = IO_REALLOC(tokens,
                sizeof *tokens * (capacity += chunk_size));

            if (!tmp) {
                IO_FREE(tokens);
                return NULL;
            }
            tokens = tmp;
        }
        tokens[token_count++] = s;
        s = strpbrk(s, delim);

        if (s) {
            *s++ = '\0';
        }
    }

    if (ntokens) {
        *ntokens = token_count;
    }
    return tokens;

}

IO_DEF char **io_split_lines(char *s, size_t *nlines)
{
    return io_split_by_delim(s, "\n", nlines);
}

IO_DEF char *io_read_line(FILE *stream, size_t *size)
{
    const size_t page_size = BUFSIZ;
    size_t count = 0;
    size_t capacity = 0;
    char *line = NULL;

    for (;;) {
        if (count >= capacity) {
            char *const tmp = realloc(line, capacity += page_size);

            if (!tmp) {
                free(line);
                return NULL;
            }

            line = tmp;
        }

        int c = getc(stream);

        if (c == EOF || c == '\n') {
            if (c == EOF) {
                if (feof(stream)) {
                    if (!count) {
                        free(line);
                        return NULL;
                    }
                    /* Return what was read. */
                    break;
                }
                free(line);
                return NULL;
            } else {
                break;
            }
        } else {
            line[count] = (char) c;
        }
        ++count;
    }

    /* Shrink line to size if possible. */
    void *tmp = realloc(line, count + 1);

    if (tmp) {
        line = tmp;
    }

    line[count] = '\0';
    *size = ++count;
    return line;
}

/* 
 * Reasons to not use `fseek()` and `ftell()` to compute the size of the file:
 * 
 * Subclause 7.12.9.2 of the C Standard [ISO/IEC 9899:2011] specifies the
 * following behavior when opening a binary file in binary mode:
 * 
 * >> A binary stream need not meaningfully support fseek calls with a whence 
 * >> value of SEEK_END.
 *
 * In addition, footnote 268 of subclause 7.21.3 says:
 *
 * >> Setting the file position indicator to end-of-file, as with 
 * >> fseek(file, 0, SEEK_END) has undefined behavior for a binary stream.
 *
 * For regular files, the file position indicator returned by ftell() is useful
 * only in calls to fseek. As such, the value returned may not be reflect the 
 * physical byte offset. 
 *
 * Reasons to not use other non-standard functions:
 * 1) fseeko()/ftello() - Not specified in the C Standard, and have the same 
 *                        problem as the method above. 
 * 
 * 2) fstat()/stat() - Not specified in the C standard. POSIX specific.
 * 3) _filelength()/_filelengthi64()/GetFileSizeEx() - Not specified in the C
 *                                                     Standard. Windows
 *                                                     specific.
 * As such, we read the file in chunks, which is the only portable way to
 * determine the size of a file.
 *
 * Yet this is highly inefficient. We could use some #ifdefs and revert to
 * platform-specific functions.
 */
IO_DEF bool io_fsize(FILE *stream, size_t *size)
{
/* TODO:
 *
 * #if defined(_WIN32)
 *   Windows also supports fileno(), struct stat, and fstat() as _fileno(),
 *   _fstat(), and struct _stat.
 *
 *   #ifdef _WIN32 
 *   #define fstat  _fstat
 *   #define fileno _fileno
 *   #define stat   _stat 
 *   #endif
 *
 *   But which version to use? See: 
 *   learn.microsoft/en-us/cpp/c-runtime-library/reference/fstat-fstat32-fstat64-fstati64-fstat32i64-fstat64i32?view=msvc-170
 * #elif defined(__unix__) || defined(__linux__) et cetera
 *      struct stat st;
 *
 *      if (fstat(fileno(stream), st) == 0) {
 *          return st.size;
 *      }
 *      return -1?
 *          
 *      }
 * #else 
 *   Fall back to the default and read it in chunks.
 * #endif
 */
    size_t rcount = 0;
    char chunk[CHUNK_SIZE];

    while ((rcount = fread(chunk, 1, CHUNK_SIZE, stream)) > 0) {
        *size += rcount;
    }

    return !ferror(stream);
}

IO_DEF bool io_write_lines(FILE *stream, size_t nlines,
    char *lines[const static nlines])
{
    for (size_t i = 0; i < nlines; ++i) {
        if (fprintf(stream, "%s\n", lines[i]) < 0) {
            return false;
        }
    }

    return true;
}

IO_DEF bool io_write_file(FILE *stream, size_t nbytes,
    const char data[static nbytes])
{
    size_t nwritten = fwrite(data, 1, nbytes, stream);

    if (nwritten != nbytes || ferror(stream)) {
        return false;
    }

    return true;
}

#undef CHUNK_SIZE
#undef TOKEN_CHUNK_SIZE
#endif                          /* IO_IMPLEMENTATION */

And here's how it can be used:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define IO_IMPLEMENTATION
#define IO_STATIC
#include "io.h"

int main(int argc, char **argv)
{
    if (argc == 2) {
        FILE *fp = fopen(argv[1], "rb");
        assert(fp);
        
        size_t nbytes = 0;
        char *const fbuf = io_read_file(fp, &nbytes);
        assert(fbuf);
        assert(io_write_file(stdout, nbytes, fbuf));

        rewind(fp);

        size_t size = 0;
        bool rv = io_fsize(fp, &size);
        assert(rv);
        printf("Filesize: %zu.\n", size);

        rewind(fp);

        /* size_t nlines = 0; */
        /* char **lines = io_split_lines(fbuf, &nlines); */
        /* assert(lines); */
        /* assert(io_write_lines(stdout, nlines, lines)); */
        
        size_t ntokens = 0;
        char **tokens = io_split_by_delim(fbuf, " \f\n\r\v\t", &ntokens);
        assert(tokens);
        assert(io_write_lines(stdout, ntokens, tokens));

        free(fbuf);
        free(tokens);
        /* free(lines); */    
        fclose(fp);
    }
    
    return EXIT_SUCCESS;
}

Review Request:

Are there any bugs or undefined/implementation-defined behavior in the code? Are there any edge cases where the functions would leak memory?

Where would restrict be suitable to use?

General coding comments, style, bad practices, et cetera.

\$\endgroup\$
3
  • 1
    \$\begingroup\$ What's your reasoning for putting the implementation into a header file, rather than a unit file? \$\endgroup\$ Commented Feb 27 at 12:35
  • \$\begingroup\$ @RichardNeumann Personal preference, and I find them easier to use this way, and some more here: github.com/nothings/… \$\endgroup\$
    – Harith
    Commented Feb 27 at 12:41
  • 1
    \$\begingroup\$ if (nwritten != nbytes || ferror(stream)) { will be true on a successful write if the error flag was set prior to the io_write_file() call. Recommend if (nwritten != nbytes) \$\endgroup\$
    – chux
    Commented Mar 7 at 19:35

2 Answers 2

5
\$\begingroup\$

There is a lot to review here.


Power-of-2 read size in io_read_line()

Since it looks like code is trying to use a power-of-2 page_size, Lets fread() read a power-of-2 bytes

    // IO_REALLOC(content, len + page_size)
    // ...
    // rcount = fread(content + len, 1, page_size - 1, stream);

    IO_REALLOC(content, len + page_size + 1)
    ...
    rcount = fread(content + len, 1, page_size, stream);

OR

Just allocate in a power of 2 increment and perform, when needed, a IO_REALLOC() at the end to accommodate the append null character.

OR

Just allocate in a power of 2 increment and always perform a "right-size" IO_REALLOC() at the end to accommodate the exact size needed.

O() of io_read_line() and io_read_line()

Rather than grow allocation linearly by capacity += page_size, use a geometric growth. I like 0, 1, 3, 7, 15 ... SIZE_MAX or capacity = capacity*2 + 1.

With linear growth code is making O(length) allocations and potentiality incurring O(length*length) time.

With geometric growth, code is making O(ln(length)) allocations and potentiality incurring O(length*ln(length)) time.

Minor: reformat for clarity

Consider putting attributes on following line,

// IO_DEF char *io_read_file(FILE *stream,
//   size_t *nbytes) ATTRIB_NONNULL(1) ATTRIB_WARN_UNUSED_RESULT;

IO_DEF char *io_read_file(FILE *stream, size_t *nbytes) //
  ATTRIB_NONNULL(1) ATTRIB_WARN_UNUSED_RESULT;

Bug: Infinite loop in io_read_file()?

When fread(content + len, 1, page_size - 1, stream); returns 0 due to end-of-file, for (size_t rcount = 1; rcount; len += rcount) iterates endlessly.

Do not assume file error is false before io_read_file()

Similar problem in io_read_line().

    // ferror(stream) could be true due to prior stream error.
    if (ferror(stream)) {
        IO_FREE(content);
        return content = NULL;
    }

Fix for this and prior bug:

    size_t n = page_size - 1; // Or n = page_size (see above).
    rcount = fread(content + len, 1, n, stream);
    if (rcount < n) {
      if (!feof(stream)) {
        IO_FREE(content);
        return content = NULL;
      }
      break;
    }

*nbytes on error in io_read_line()

IMO, *nbytes should always be set, when nbytes != NULL, even when NULL is returned from the function.

Code may return non-NULL on input error in io_read_line()

This review comment not applicable as even though code was feof(stream) I originally processed that as ferror(stream).

Code has:
             if (feof(stream)) {
                if (!count) {
                    free(line);
                    return NULL;
                }
                /* Return what was read. */
                break;
            }

which, IMO, should always return NULL when an input error occurs, not just when count == 0.

Bug: Unsafe re-alloc()

In the following code, should tmp == NULL, line[count] may be beyond prior allocation as count == capacity is possible.

void *tmp = realloc(line, count + 1);
if (tmp) {
    line = tmp;
}
line[count] = '\0';  // not safe

Questionable file size

while ((rcount = fread(chunk, 1, CHUNK_SIZE, stream)) > 0) continues to read even when a read error occurs. Read errors are not certainly sticky. The read error flag is sticky.

Better as

IO_DEF bool io_fsize(FILE *stream, size_t *size) {
    size_t rcount = 0;
    char chunk[CHUNK_SIZE];

    // Consider a `rewind()` here
    // Depends on what OP wants should stream not be at the beginning.

    do {
      rcount = fread(chunk, 1, CHUNK_SIZE, stream);
     *size += rcount;
    } while (rcount == CHUNK_SIZE);
    return !ferror(stream);
}

Unsigned constants

Since CHUNK_SIZE and TOKEN_CHUNK_SIZE are use exclusively for sizing, uses size_t math and types.

// #define CHUNK_SIZE          INT64_C(1024 * 8)
// #define TOKEN_CHUNK_SIZE    INT64_C(1024 * 2)
#define CHUNK_SIZE          ((size_t)1024 * 8)
#define TOKEN_CHUNK_SIZE    ((size_t)1024 * 2)

Side issue: INT64_C(1024 * 2) make a constant that is at least 64-bit, but I do not think that means the 1024 * 2 is done first using 64-bit math, but with int math. Might be a problem with 1024 * 1024 * 1024 * 2 as that int overflows.

File size versus size_t

There are many systems where file size > SIZE_MAX. Consider using wider math for file size or detect overflow.

Uncomplicate long lines

// char **const tmp = IO_REALLOC(tokens,
//            sizeof *tokens * (capacity += chunk_size));
capacity += chunk_size;
char **const tmp = IO_REALLOC(tokens, sizeof *tokens * capacity);

Style: little reason to type the return from realloc()

(Further, I like comparing with == clearer than !=. 'I don't know half of you half ...)

        // char **const tmp = IO_REALLOC(tokens, sizeof *tokens * (capacity += chunk_size));
        // if (!tmp) {
        //    IO_FREE(tokens);
        //    return NULL;
        // }
        // tokens = tmp;

        void *tmp = IO_REALLOC(tokens, sizeof *tokens * (capacity += chunk_size));
        if (tmp == NULL) {
            IO_FREE(tokens);
            return NULL;
        }
        tokens = tmp; 

Note: if (ferror()) can mislead.


I realize now OP's post comes after a recent post. It looks like there is a fair amount of repetition of same issues.

\$\endgroup\$
0
1
\$\begingroup\$

Is there any good reason that we don't set IO_IMPLEMENTATION whenever IO_STATIC is defined? It doesn't make sense to have static linkage without defining the functions.


We can simplify this test:

#if defined(IO_MALLOC) && defined(IO_REALLOC) && defined(IO_FREE)
// Ok.
#elif !defined(IO_MALLOC) && !defined(IO_REALLOC) && !defined(IO_FREE)
// Ok.
#else
#error  "Must define all or none of IO_MALLOC, IO_REALLOC, and IO_FREE."
#endif

Consider instead

#if defined(IO_MALLOC) != defined(IO_REALLOC) || defined(IO_REALLOC) != defined(IO_FREE)
#error  "Must define all or none of IO_MALLOC, IO_REALLOC, and IO_FREE."
#endif

However, it's more normal to swap in a different allocation library using your system's linker (perhaps via LD_PRELOAD on Linux systems), rather than by writing macros like this.


This assertion is incorrect:

    FILE *fp = fopen(argv[1], "rb");
    assert(fp);

fopen() can return a null pointer for any of several reasons, which should be checked for first:

        FILE *fp = fopen(argv[1], "rb");
        if (!fp) {
            perror(argv[1]);
            return EXIT_FAILURE;
        }
        assert(fp);

Why do we return a success status when called with incorrect number of arguments? That's surprising.


I'll do a detailed review of two functions.

#define CHUNK_SIZE          INT64_C(1024 * 8)
#define TOKEN_CHUNK_SIZE    INT64_C(1024 * 2)

IO_DEF char *io_read_file(FILE *stream, size_t *nbytes)
{
    static const size_t page_size = CHUNK_SIZE;

Why are we assigning a (signed) int64_t to a size_t variable? I see no need for CHUNK_SIZE to include type information at all.

    char *content = NULL;
    size_t len = 0;

    for (size_t rcount = 1; rcount; len += rcount) {

That condition is wrong - we should continue only if rcount is positive.

        void *const tmp = IO_REALLOC(content, len + page_size);

        if (!tmp) {
            IO_FREE(content);
            return content = NULL;

No need to assign content here, as it's going out of scope anyway.

Documentation needs to be clear that *nbytes is not modified in this error case.

        }
        content = tmp;
        rcount = fread(content + len, 1, page_size - 1, stream);

        if (ferror(stream)) {

Broken assumption here - we should be testing fread() return value.

            IO_FREE(content);
            return content = NULL;
        }
    }

    if (nbytes) {
        *nbytes = len;
    }
    content[len] = '\0';
    return content;
}
IO_DEF bool io_write_file(FILE *stream, size_t nbytes,
    const char data[static nbytes])
{
    size_t nwritten = fwrite(data, 1, nbytes, stream);

    if (nwritten != nbytes || ferror(stream)) {

If nwritten is positive but less than nbytes, then we should continue writing. We certainly shouldn't fail when the first write is successful.

If nwritten is zero, then we have failed (and don't need to test ferror()).

        return false;
    }

    return true;
}
\$\endgroup\$
2
  • \$\begingroup\$ "Re: fopen() can return a null pointer for any of several reasons, which should be checked for first:" Yes, I was being lazy here. About calling fwrite() in a loop, the C standard says: "The fwrite function returns the number of elements successfully written, which will be less than nmemb only if a write error is encountered." I didn't think calling it again on a write error would make sense, and this StackOverflow answer also suggests against it: stackoverflow.com/a/9389745/20017547 \$\endgroup\$
    – Harith
    Commented Mar 3 at 15:15
  • \$\begingroup\$ Ah, that's me getting mixed up with POSIX write(). That test becomes much simpler - if (nwritten != nbytes). Still no need for the ferror() test. \$\endgroup\$ Commented Mar 3 at 15:17

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Not the answer you're looking for? Browse other questions tagged or ask your own question.