The following is a simple example of converting utf-8 to utf-16 using stdio.h and stdlib.h, while taking into account some error conditions:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define UNICODE_ERR -1
#define MALLOC_ERR  -2

/* Check that 'count' continuation bytes (10xxxxxx) follow p[0];
 * this also guarantees we never read past the terminating NUL. */
static int has_continuations(const uint8_t *p, int count) {
    for (int i = 1; i <= count; i++) {
        if ((p[i] & 0xC0) != 0x80) {
            return 0;
        }
    }
    return 1;
}

/*
 * Convert the NUL-terminated UTF-8 string 'utf8' into a freshly
 * malloc'd, 0-terminated UTF-16 (native-endian) array stored through
 * '*utf16'.
 *
 * Returns the number of UTF-16 code units written (excluding the
 * terminating 0), UNICODE_ERR on malformed input, or MALLOC_ERR if
 * allocation fails.  On success the caller owns '*utf16' and must
 * free() it.
 */
int utf8_to_utf16(uint8_t *utf8, uint16_t **utf16) {
    int len = 0;            /* UTF-16 code units required */
    uint8_t *p = utf8;

    /* First pass: validate the sequence structure and measure the
     * output.  NOTE: a 4-byte UTF-8 sequence becomes a surrogate
     * PAIR, i.e. two code units, not one. */
    while (*p) {
        if ((*p & 0x80) == 0x00) {          /* 1-byte sequence */
            len += 1;
            p += 1;
        } else if ((*p & 0xE0) == 0xC0) {   /* 2-byte sequence */
            if (!has_continuations(p, 1)) return UNICODE_ERR;
            len += 1;
            p += 2;
        } else if ((*p & 0xF0) == 0xE0) {   /* 3-byte sequence */
            if (!has_continuations(p, 2)) return UNICODE_ERR;
            len += 1;
            p += 3;
        } else if ((*p & 0xF8) == 0xF0) {   /* 4-byte sequence */
            if (!has_continuations(p, 3)) return UNICODE_ERR;
            len += 2;                       /* surrogate pair */
            p += 4;
        } else {
            return UNICODE_ERR;             /* invalid lead byte */
        }
    }

    *utf16 = malloc((len + 1) * sizeof **utf16);
    if (*utf16 == NULL) {
        return MALLOC_ERR;
    }

    /* Second pass: decode each scalar value and emit UTF-16. */
    p = utf8;
    int i = 0;
    while (*p) {
        if ((*p & 0x80) == 0x00) {
            (*utf16)[i++] = (uint16_t)*p;
            p += 1;
        } else if ((*p & 0xE0) == 0xC0) {
            (*utf16)[i++] = (uint16_t)(((*p & 0x1F) << 6) | (p[1] & 0x3F));
            p += 2;
        } else if ((*p & 0xF0) == 0xE0) {
            (*utf16)[i++] = (uint16_t)(((*p & 0x0F) << 12) |
                                       ((p[1] & 0x3F) << 6) |
                                       (p[2] & 0x3F));
            p += 3;
        } else {
            /* 4-byte sequence: the first pass guarantees the lead
             * byte is 0xF0..0xF7 and three continuations follow. */
            uint32_t cp = ((uint32_t)(*p & 0x07) << 18) |
                          ((uint32_t)(p[1] & 0x3F) << 12) |
                          ((uint32_t)(p[2] & 0x3F) << 6) |
                          (uint32_t)(p[3] & 0x3F);
            if (cp < 0x10000 || cp > 0x10FFFF) {
                /* overlong encoding or beyond the Unicode range */
                free(*utf16);
                *utf16 = NULL;
                return UNICODE_ERR;
            }
            cp -= 0x10000;
            (*utf16)[i++] = (uint16_t)(0xD800 | (cp >> 10));    /* high surrogate */
            (*utf16)[i++] = (uint16_t)(0xDC00 | (cp & 0x3FF));  /* low surrogate */
            p += 4;
        }
    }
    (*utf16)[i] = 0;    /* terminate with 0 for subsequent processing */
    return len;
}

int main(void) {
    uint8_t utf8[] = "Hello, world!";   /* UTF-8 encoded string */
    uint16_t *utf16 = NULL;

    int len = utf8_to_utf16(utf8, &utf16);
    if (len == UNICODE_ERR) {
        printf("Error: Invalid utf-8 encoding.\n");
        return 1;
    } else if (len == MALLOC_ERR) {
        printf("Error: Failed to allocate memory.\n");
        return 1;
    }
    for (int i = 0; i < len; i++) {
        printf("%04X ", utf16[i]);
    }
    printf("\n");
    free(utf16);
    return 0;
}
For example, with the input "Hello,世界！" the output is:
0048 0065 006C 006C 006F 002C 4E16 754C FF01 0000
Note that the utf16 array is terminated with 0x0000 so that it can be used in subsequent processing.
#include <stdio.h>
#include <stdlib.h>
// Check whether a NUL-terminated string is well-formed UTF-8.
// Returns 1 when every sequence is structurally valid (a legal lead
// byte followed by the right number of 10xxxxxx continuation bytes),
// 0 otherwise.  Note: this checks structure only, not overlong
// encodings or surrogate code points.
int is_utf8_valid(const char* utf8) {
    unsigned char c;
    int num_bytes = 0;   /* continuation bytes still expected */
    while (*utf8 != '\0') {
        c = (unsigned char)*(utf8++);
        if (num_bytes == 0) {
            if ((c >> 5) == 0x06) {          /* 110xxxxx: 2-byte lead */
                num_bytes = 1;
            } else if ((c >> 4) == 0x0E) {   /* 1110xxxx: 3-byte lead */
                num_bytes = 2;
            } else if ((c >> 3) == 0x1E) {   /* 11110xxx: 4-byte lead */
                num_bytes = 3;
            } else if ((c >> 7) != 0) {
                /* 10xxxxxx or 11111xxx cannot start a sequence */
                return 0;
            }
        } else {
            if ((c >> 6) != 0x02) {          /* must be 10xxxxxx */
                return 0;
            }
            num_bytes--;
        }
    }
    /* A truncated trailing sequence is also illegal. */
    return (num_bytes == 0);
}
// Convert a valid UTF-8 string to a malloc'd, 0-terminated UTF-16
// (native-endian) array.  Returns NULL (after printing a message) if
// the input is not valid UTF-8 or allocation fails; otherwise the
// caller owns the returned buffer and must free() it.
unsigned short* utf8_to_utf16(const char* utf8) {
    if (!is_utf8_valid(utf8)) {
        printf("Error: The input UTF-8 encoding is invalid.\n");
        return NULL;
    }
    /* The byte count is an upper bound on the number of UTF-16 code
     * units: 1-3 byte sequences yield one unit, 4-byte sequences
     * (4 bytes) yield two. */
    int utf8_len = 0;
    for (const char* p = utf8; *p != '\0'; p++) {
        utf8_len++;
    }
    unsigned short* utf16 = malloc((utf8_len + 1) * sizeof(unsigned short));
    if (utf16 == NULL) {
        printf("Error: memory allocation failed.\n");
        return NULL;
    }
    int utf16_len = 0;
    while (*utf8 != '\0') {
        unsigned char c = (unsigned char)*(utf8++);
        unsigned long codepoint;
        if ((c >> 7) == 0) {
            /* Single-byte sequence: the byte is the code point. */
            codepoint = c;
        } else if ((c >> 5) == 0x06) {
            /* Two-byte sequence.  Each continuation byte is read in
             * its own statement: several (*utf8++) inside one
             * expression is unsequenced and undefined behavior. */
            unsigned char c1 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x1F) << 6) | (c1 & 0x3F);
        } else if ((c >> 4) == 0x0E) {
            /* Three-byte sequence. */
            unsigned char c1 = (unsigned char)*(utf8++);
            unsigned char c2 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x0F) << 12) |
                        ((unsigned long)(c1 & 0x3F) << 6) |
                        (c2 & 0x3F);
        } else {
            /* Four-byte sequence: the code point exceeds 16 bits, so
             * it must be accumulated in a wide type, not short. */
            unsigned char c1 = (unsigned char)*(utf8++);
            unsigned char c2 = (unsigned char)*(utf8++);
            unsigned char c3 = (unsigned char)*(utf8++);
            codepoint = ((unsigned long)(c & 0x07) << 18) |
                        ((unsigned long)(c1 & 0x3F) << 12) |
                        ((unsigned long)(c2 & 0x3F) << 6) |
                        (c3 & 0x3F);
        }
        if (codepoint >= 0x10000) {
            /* Supplementary plane: encode as a surrogate pair. */
            codepoint -= 0x10000;
            utf16[utf16_len++] = (unsigned short)(0xD800 | (codepoint >> 10));
            utf16[utf16_len++] = (unsigned short)(0xDC00 | (codepoint & 0x3FF));
        } else {
            utf16[utf16_len++] = (unsigned short)codepoint;
        }
    }
    utf16[utf16_len] = 0;   /* end with 0 */
    return utf16;
}
int main(void) {
    const char* utf8 = "Hello, world!";   /* UTF-8 encoded string */
    unsigned short* utf16 = utf8_to_utf16(utf8);
    if (utf16 != NULL) {
        printf("UTF-16 encoding:");
        for (int i = 0; utf16[i] != 0; i++) {
            /* %04X: each code unit as four hex digits */
            printf(" %04X", utf16[i]);
        }
        printf("\n");
        free(utf16);
    }
    return 0;
}