The following is a simple example of converting utf-8 to utf-16 using stdio.h and stdlib.h, while taking into account some error conditions:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define UNICODE_ERR -1
#define MALLOC_ERR  -2

/* Check that 'count' continuation bytes (10xxxxxx) follow p[0];
 * this also guarantees we never read past the terminating NUL. */
static int has_continuations(const uint8_t *p, int count) {
    for (int i = 1; i <= count; i++) {
        if ((p[i] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return 1;
}

/*
 * Convert the NUL-terminated UTF-8 string 'utf8' into a freshly
 * malloc'd, 0-terminated UTF-16 (native-endian) array stored through
 * '*utf16'.
 *
 * Returns the number of UTF-16 code units written (excluding the
 * terminating 0), UNICODE_ERR on malformed input, or MALLOC_ERR if
 * allocation fails.  On success the caller owns '*utf16' and must
 * free() it.
 */
int utf8_to_utf16(uint8_t *utf8, uint16_t **utf16) {
    int len = 0;            /* UTF-16 code units required */
    uint8_t *p = utf8;

    /* First pass: validate the sequence structure and measure the
     * output.  NOTE: a 4-byte UTF-8 sequence becomes a surrogate
     * PAIR, i.e. two code units, not one. */
    while (*p) {
        if ((*p & 0x80) == 0x00) {          /* 1-byte sequence */
            len += 1;
            p += 1;
        } else if ((*p & 0xE0) == 0xC0) {   /* 2-byte sequence */
            if (!has_continuations(p, 1)) return UNICODE_ERR;
            len += 1;
            p += 2;
        } else if ((*p & 0xF0) == 0xE0) {   /* 3-byte sequence */
            if (!has_continuations(p, 2)) return UNICODE_ERR;
            len += 1;
            p += 3;
        } else if ((*p & 0xF8) == 0xF0) {   /* 4-byte sequence */
            if (!has_continuations(p, 3)) return UNICODE_ERR;
            len += 2;                       /* surrogate pair */
            p += 4;
        } else {
            return UNICODE_ERR;             /* invalid lead byte */
        }
    }

    *utf16 = malloc((len + 1) * sizeof **utf16);
    if (*utf16 == NULL) {
        return MALLOC_ERR;
    }

    /* Second pass: decode each scalar value and emit UTF-16. */
    p = utf8;
    int i = 0;
    while (*p) {
        if ((*p & 0x80) == 0x00) {
            (*utf16)[i++] = (uint16_t)*p;
            p += 1;
        } else if ((*p & 0xE0) == 0xC0) {
            (*utf16)[i++] = (uint16_t)(((*p & 0x1F) << 6) | (p[1] & 0x3F));
            p += 2;
        } else if ((*p & 0xF0) == 0xE0) {
            (*utf16)[i++] = (uint16_t)(((*p & 0x0F) << 12) |
                                       ((p[1] & 0x3F) << 6) |
                                       (p[2] & 0x3F));
            p += 3;
        } else {
            /* 4-byte sequence: the first pass guarantees the lead
             * byte is 0xF0..0xF7 and three continuations follow. */
            uint32_t cp = ((uint32_t)(*p & 0x07) << 18) |
                          ((uint32_t)(p[1] & 0x3F) << 12) |
                          ((uint32_t)(p[2] & 0x3F) << 6) |
                          (uint32_t)(p[3] & 0x3F);
            if (cp < 0x10000 || cp > 0x10FFFF) {
                /* overlong encoding or beyond the Unicode range */
                free(*utf16);
                *utf16 = NULL;
                return UNICODE_ERR;
            }
            cp -= 0x10000;
            (*utf16)[i++] = (uint16_t)(0xD800 | (cp >> 10));    /* high surrogate */
            (*utf16)[i++] = (uint16_t)(0xDC00 | (cp & 0x3FF));  /* low surrogate */
            p += 4;
        }
    }
    (*utf16)[i] = 0;    /* terminate with 0 for subsequent processing */
    return len;
}

int main(void) {
    uint8_t utf8[] = "Hello, world!";   /* UTF-8 encoded string */
    uint16_t *utf16 = NULL;

    int len = utf8_to_utf16(utf8, &utf16);
    if (len == UNICODE_ERR) {
        printf("Error: Invalid utf-8 encoding.\n");
        return 1;
    } else if (len == MALLOC_ERR) {
        printf("Error: Failed to allocate memory.\n");
        return 1;
    }
    for (int i = 0; i < len; i++) {
        printf("%04X ", utf16[i]);
    }
    printf("\n");
    free(utf16);
    return 0;
}
For example, with the input "Hello,世界！" the output is:
0048 0065 006C 006C 006F 002C 4E16 754C FF01 0000
Note that the utf16 array is terminated with 0x0000 so that it can be used in subsequent processing.
#include <stdio.h>
#include <stdlib.h>
// Check whether a NUL-terminated string is well-formed UTF-8.
// Returns 1 when every sequence is structurally valid (a legal lead
// byte followed by the right number of 10xxxxxx continuation bytes),
// 0 otherwise.  Note: this checks structure only, not overlong
// encodings or surrogate code points.
int is_utf8_valid(const char* utf8) {
    unsigned char c;
    int num_bytes = 0;   /* continuation bytes still expected */
    while (*utf8 != '\0') {
        c = (unsigned char)*(utf8++);
        if (num_bytes == 0) {
            if ((c >> 5) == 0x06) {          /* 110xxxxx: 2-byte lead */
                num_bytes = 1;
            } else if ((c >> 4) == 0x0E) {   /* 1110xxxx: 3-byte lead */
                num_bytes = 2;
            } else if ((c >> 3) == 0x1E) {   /* 11110xxx: 4-byte lead */
                num_bytes = 3;
            } else if ((c >> 7) != 0) {
                /* 10xxxxxx or 11111xxx cannot start a sequence */
                return 0;
            }
        } else {
            if ((c >> 6) != 0x02) {          /* must be 10xxxxxx */
                return 0;
            }
            num_bytes--;
        }
    }
    /* A truncated trailing sequence is also illegal. */
    return (num_bytes == 0);
}
// Convert a valid UTF-8 string to a malloc'd, 0-terminated UTF-16
// (native-endian) array.  Returns NULL (after printing a message) if
// the input is not valid UTF-8 or allocation fails; otherwise the
// caller owns the returned buffer and must free() it.
unsigned short* utf8_to_utf16(const char* utf8) {
    if (!is_utf8_valid(utf8)) {
        printf("Error: The input UTF-8 encoding is invalid.\n");
        return NULL;
    }
    /* The byte count is an upper bound on the number of UTF-16 code
     * units: 1-3 byte sequences yield one unit, 4-byte sequences
     * (4 bytes) yield two. */
    int utf8_len = 0;
    for (const char* p = utf8; *p != '\0'; p++) {
        utf8_len++;
    }
    unsigned short* utf16 = malloc((utf8_len + 1) * sizeof(unsigned short));
    if (utf16 == NULL) {
        printf("Error: memory allocation failed.\n");
        return NULL;
    }
    int utf16_len = 0;
    while (*utf8 != '\0') {
        unsigned char c = (unsigned char)*(utf8++);
        unsigned long codepoint;
        if ((c >> 7) == 0) {
            /* Single-byte sequence: the byte is the code point. */
            codepoint = c;
        } else if ((c >> 5) == 0x06) {
            /* Two-byte sequence.  Each continuation byte is read in
             * its own statement: several (*utf8++) inside one
             * expression is unsequenced and undefined behavior. */
            unsigned char c1 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x1F) << 6) | (c1 & 0x3F);
        } else if ((c >> 4) == 0x0E) {
            /* Three-byte sequence. */
            unsigned char c1 = (unsigned char)*(utf8++);
            unsigned char c2 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x0F) << 12) |
                        ((unsigned long)(c1 & 0x3F) << 6) |
                        (c2 & 0x3F);
        } else {
            /* Four-byte sequence: the code point exceeds 16 bits, so
             * it must be accumulated in a wide type, not short. */
            unsigned char c1 = (unsigned char)*(utf8++);
            unsigned char c2 = (unsigned char)*(utf8++);
            unsigned char c3 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x07) << 18) |
                        ((unsigned long)(c1 & 0x3F) << 12) |
                        ((unsigned long)(c2 & 0x3F) << 6) |
                        (c3 & 0x3F);
        }
        if (codepoint >= 0x10000) {
            /* Supplementary plane: encode as a surrogate pair. */
            codepoint -= 0x10000;
            utf16[utf16_len++] = (unsigned short)(0xD800 | (codepoint >> 10));
            utf16[utf16_len++] = (unsigned short)(0xDC00 | (codepoint & 0x3FF));
        } else {
            utf16[utf16_len++] = (unsigned short)codepoint;
        }
    }
    utf16[utf16_len] = 0;   /* end with 0 */
    return utf16;
}
int main(void) {
    const char* utf8 = "Hello, world!";   /* UTF-8 encoded string */
    unsigned short* utf16 = utf8_to_utf16(utf8);
    if (utf16 != NULL) {
        printf("UTF-16 encoding:");
        for (int i = 0; utf16[i] != 0; i++) {
            /* %04X: each code unit as four hex digits */
            printf(" %04X", utf16[i]);
        }
        printf("\n");
        free(utf16);
    }
    return 0;
}