Title-Case of Hyphenated Words
This algorithm only recognizes the space character as a word separator. It doesn’t capitalize words after hyphens, unlike Standard English.
Unicode
If you’re formatting text for real-world display in this century, you need to support Unicode, not just ASCII. There are some Unicode ligatures whose title-case form differs from their uppercase form, like dz
→ Dz
, (as in Dzlecko) and some whose capitalization changes in another locale, like i
→ İ
in Turkish.
Since your input and output in 2023 should support UTF-8, and the title-case mapping of a character does not necessarily occupy the same number of bytes in UTF-8 as the original, you will need separate input and output buffers. As the wctrans() property name totitle
is not required by the C standard, you may want to fall back to toupper
for portability.
Putting it All Together
First, some boilerplate for compatibility and error-handling:
#include <assert.h>
#include <limits.h>
#include <locale.h>
#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <uchar.h>
#include <wctype.h>
/*
 * Portability shims.
 *
 * __has_c_attribute is itself a fairly recent preprocessor operator.  On a
 * compiler that lacks it, writing `#elif __has_c_attribute(x)` directly is a
 * preprocessing error rather than "false", so the operator's existence is
 * tested with defined() first and only then evaluated in a nested #if.
 */

/* MUSTTAIL: forces a guaranteed tail call where clang supports it, otherwise
 * expands to nothing. */
#if defined(_MSC_VER) && !defined(__clang__)
# define MUSTTAIL /**/
#elif defined(__has_c_attribute)
# if __has_c_attribute(clang::musttail)
#  define MUSTTAIL [[clang::musttail]]
# else
#  define MUSTTAIL /**/
# endif
#else
# define MUSTTAIL /**/
#endif

/* NORETURN: marks a function that never returns to its caller. */
#if defined(_MSC_VER) && !defined(__clang__)
# define NORETURN __declspec(noreturn)
#elif defined(__has_c_attribute)
# if __has_c_attribute(noreturn)
#  define NORETURN [[noreturn]]
# else
#  define NORETURN _Noreturn
# endif
#else // Any implementation supporting <uchar.h> probably also supports _Noreturn.
# define NORETURN _Noreturn
#endif

/* Size in bytes of the fixed buffers used by the test driver. */
#define BUFSIZE 4096U
/*
 * Reports an unrecoverable error and terminates the program.
 *
 * msg  - description of the failure
 * file - source file name to report
 * line - source line number to report
 *
 * Pending standard output is flushed first, then the diagnostic is written
 * to standard error and the process exits with EXIT_FAILURE.
 */
NORETURN void fatal_error_handler( const char* const msg,
                                   const char* const file,
                                   const int line );

NORETURN void fatal_error_handler( const char* const msg,
                                   const char* const file,
                                   const int line )
{
    fflush(stdout);
    fprintf(stderr, "%s (at line %s:%d)\n", msg, file, line);
    exit(EXIT_FAILURE);
}

/* Convenience wrapper that supplies the call site's file and line. */
#define fatal_error(msg) fatal_error_handler((msg), __FILE__, __LINE__)
I implemented the UTF-8 title-case transformation as a finite state machine using tail recursion, because I’m weird. This is only efficient on LLVM compilers that support [[clang::musttail]]
, and it might not work for codepoints outside the BMP on platforms where wchar_t
is 16-bit.
/*
 * One step of the title-casing state machine; processes the rest of the
 * input by calling itself in tail position.
 *
 * n_in       - number of bytes remaining in `in`
 * in         - remaining multibyte input
 * n_out      - number of bytes remaining in `out`
 * out        - where the next output bytes are written
 * in_st      - conversion state used for decoding with mbrtoc32()
 * out_st     - conversion state used for encoding with c32rtomb()
 * tf         - case mapping applied to the first character of each word
 * is_in_word - true if the previous character was neither whitespace nor
 *              punctuation, i.e. the next character must not be capitalized
 *
 * Returns a pointer to the terminating '\0' written into `out`.
 * Every decoding, encoding or buffer-space failure ends the program through
 * fatal_error().
 */
static inline char* titlecase_utf8_helper( const size_t n_in,
                                           const char in[],
                                           const size_t n_out,
                                           char out[],
                                           mbstate_t* const in_st,
                                           mbstate_t* const out_st,
                                           const wctrans_t tf,
                                           const bool is_in_word ) {
    char32_t decoded = U'\0';
    const size_t consumed = mbrtoc32(&decoded, in, n_in, in_st);

    /* End of input: either no bytes are left or a NUL was decoded. */
    if (0 == n_in || 0 == consumed) {
        if (0 == n_out) {
            fatal_error("Output buffer is full");
        }
        *out = '\0';
        return out;
    }

    /* Decoder failures.  fatal_error() never returns, so these guards are
     * equivalent to an else-if chain. */
    if ((size_t)-1 == consumed) {
        fatal_error("UTF-8 encoding error");
    }
    if ((size_t)-2 == consumed) {
        fatal_error("Incomplete UTF-8 string");
    }
    if ((ptrdiff_t)consumed < 0) {
        fatal_error("Unknown UTF-8 processing error");
    }
    if (consumed > n_in) { // Prevent buggy implementation from causing a buffer overrun.
        fprintf(stderr, "%u\n", (unsigned)*in);
        fatal_error("Broken mbrtoc32() returned invalid result");
    }

    /* Work out which bytes to emit and which character decides the next state. */
    char encoded[MB_LEN_MAX] = "";
    const char* src = in;
    size_t emitted = consumed;
    char32_t last = decoded;

    if (is_in_word) {
        /* Inside a word: the input bytes are copied through unchanged. */
        if (consumed >= n_out) {
            fatal_error("Output buffer full");
        }
    } else {
        /* Word-initial position: apply the case mapping and re-encode. */
        last = (char32_t)towctrans((wint_t)decoded, tf);
        emitted = c32rtomb(encoded, last, out_st);
        if ((size_t)-1 == emitted) {
            fatal_error("unicode mapping returned invalid character");
        }
        if (emitted >= n_out) {
            fatal_error("Output buffer full");
        }
        src = encoded;
    }

    /* Strictly less than n_out, so one byte always remains for the terminator. */
    assert(consumed <= n_in && emitted < n_out);
    memcpy(out, src, emitted);

    MUSTTAIL return titlecase_utf8_helper(
        n_in - consumed,
        in + consumed,
        n_out - emitted,
        out + emitted,
        in_st,
        out_st,
        tf,
        !iswspace((wint_t)last) && !iswpunct((wint_t)last) );
}
/*
 * Title-cases the multibyte string `in` into `out`.
 *
 * n_in  - number of input bytes to process
 * in    - input text, in the current locale's multibyte encoding
 * n_out - capacity of `out` in bytes
 * out   - receives the converted, NUL-terminated text
 *
 * Returns the number of bytes written to `out`, not counting the terminating
 * '\0'.  Conversion errors and insufficient output space terminate the
 * program inside the helper.
 */
static size_t titlecase_utf8( const size_t n_in,
                              const char in[],
                              const size_t n_out,
                              char out[] ) {
    // `= {0}` rather than `= {}`: the empty initializer is only valid from C23
    // on, while the rest of this file also caters for older compilers.
    mbstate_t in_state = {0};
    // initialize the states to the initial conversion state:
    mbrtoc32(NULL, "", 1, &in_state);
    mbstate_t out_state = in_state;

    wctrans_t tf = wctrans("totitle");
    if (tf == 0) { // Fall back to uppercase if titlecase is unsupported
        fflush(stdout);
        fputs("Using uppercase, not titlecase.\n", stderr);
        fflush(stderr);
        tf = wctrans("toupper"); // Guaranteed to work.
        assert(tf);
    }

    // Start outside a word so that the very first character is capitalized.
    char* const endp = titlecase_utf8_helper(n_in, in, n_out, out, &in_state, &out_state, tf, false);
    return (size_t)(endp - out);
}
Finally, a basic test driver. It requires the environment to be set to a UTF-8 locale.
/*
 * Test driver: reads one line from standard input, title-cases it and writes
 * the result to standard output.  The locale is taken from the environment.
 */
int main(void) {
    static char input_buf[BUFSIZE];
    static char output_buf[BUFSIZE];

    if (!setlocale(LC_ALL, "")) {
        // Not fatal: conversion then proceeds in the default locale.
        fputs("Warning: could not set the locale from the environment.\n", stderr);
    }

    if (!fgets(input_buf, sizeof(input_buf), stdin)) {
        // After a read error the buffer contents are indeterminate, so they
        // must not be handed to strlen().
        if (ferror(stdin)) {
            fatal_error("Failed to read from standard input");
        }
        input_buf[0] = '\0'; // End of file before any input: treat as an empty line.
    }

    const size_t output_size = titlecase_utf8(strlen(input_buf), input_buf, sizeof(output_buf), output_buf);
    assert(strlen(output_buf) == output_size);

    if (fwrite(output_buf, 1, output_size, stdout) != output_size) {
        fatal_error("Failed to write to standard output");
    }
    return EXIT_SUCCESS;
}
Try it on the Godbolt compiler explorer.
:-D
\$\endgroup\$