/*
    See license.txt in the root of this project.
*/

# include "luametatex.h"

/*tex

    The 5- and 6-byte UTF-8 sequences generate integers that are outside of the valid UCS range,
    and therefore unsupported. We recover from an error with |0xFFFD|.

*/

/*tex

    Decode the single \UTF-8 encoded character that starts at |text| and return its code point.
    A continuation byte is only looked at after the byte before it has been accepted, so a zero
    terminated string is never read beyond its terminator. A stray continuation byte, a lead
    byte above |0xF7| and a missing or malformed continuation byte all give |0xFFFD|. Overlong
    forms, surrogates and four byte values above |0x10FFFF| are not filtered: they yield whatever
    their payload bits encode.

*/

unsigned aux_str2uni(const unsigned char *text)
{
    const unsigned lead = text[0];
    if (lead < 0x80) {
        /*tex Plain \ASCII. */
        return lead;
    } else if (lead <= 0xBF) {
        /*tex A continuation byte can't start a character. */
        return 0xFFFD;
    } else if (lead <= 0xDF) {
        if ((text[1] & 0xC0) == 0x80) {
            return ((lead & 0x1Fu) << 6) | (text[1] & 0x3Fu);
        }
    } else if (lead <= 0xEF) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80) {
            return ((lead & 0x0Fu) << 12) | ((text[1] & 0x3Fu) << 6) | (text[2] & 0x3Fu);
        }
    } else if (lead <= 0xF7) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80 && (text[3] & 0xC0) == 0x80) {
            /*tex
                The 21 payload bits are assembled directly in unsigned arithmetic. Going via a
                signed \quote {plane minus one} intermediate would left shift a negative value
                (undefined in \CCODE) when the plane bits are zero, as in an overlong form.
            */
            return ((lead & 0x07u) << 18) | ((text[1] & 0x3Fu) << 12) | ((text[2] & 0x3Fu) << 6) | (text[3] & 0x3Fu);
        }
    }
    return 0xFFFD;
}

/*tex

    Decode the single \UTF-8 encoded character that starts at |text|, return its code point and
    store in |len| the number of bytes that it occupies. A continuation byte is only looked at
    after the byte before it has been accepted, so a zero terminated string is never read beyond
    its terminator.

    Every failure (a stray continuation byte, a lead byte above |0xF7|, a missing or malformed
    continuation byte) returns |0xFFFD| with |len| set to one. That way a caller that advances by
    |len| resynchronizes on the next byte and can never step over the terminator of a truncated
    sequence. Overlong forms, surrogates and four byte values above |0x10FFFF| are not filtered:
    they yield whatever their payload bits encode.

*/

unsigned aux_str2uni_len(const unsigned char *text, int *len)
{
    const unsigned lead = text[0];
    if (lead < 0x80) {
        /*tex Plain \ASCII. */
        *len = 1;
        return lead;
    } else if (lead <= 0xBF) {
        /*tex A continuation byte can't start a character, we fall through to the error exit. */
    } else if (lead <= 0xDF) {
        if ((text[1] & 0xC0) == 0x80) {
            *len = 2;
            return ((lead & 0x1Fu) << 6) | (text[1] & 0x3Fu);
        }
    } else if (lead <= 0xEF) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80) {
            *len = 3;
            return ((lead & 0x0Fu) << 12) | ((text[1] & 0x3Fu) << 6) | (text[2] & 0x3Fu);
        }
    } else if (lead <= 0xF7) {
        if ((text[1] & 0xC0) == 0x80 && (text[2] & 0xC0) == 0x80 && (text[3] & 0xC0) == 0x80) {
            /*tex
                The 21 payload bits are assembled directly in unsigned arithmetic. Going via a
                signed \quote {plane minus one} intermediate would left shift a negative value
                (undefined in \CCODE) when the plane bits are zero, as in an overlong form.
            */
            *len = 4;
            return ((lead & 0x07u) << 18) | ((text[1] & 0x3Fu) << 12) | ((text[2] & 0x3Fu) << 6) | (text[3] & 0x3Fu);
        }
    }
    /*tex Only the offending lead byte is consumed. */
    *len = 1;
    return 0xFFFD;
}

/*tex

    Encode the code point |unic| as a zero terminated \UTF-8 string in a fresh five byte block
    obtained from |lmt_memory_malloc|. When the allocation fails the (null) result of the
    allocator is passed on unchanged. Values of |0x110000| and above produce an empty string.
    Surrogates are not treated specially.

*/

unsigned char *aux_uni2str(unsigned unic)
{
    /*tex Room for at most four encoded bytes plus the terminator. */
    unsigned char *result = lmt_memory_malloc(5);
    unsigned char *p = result;
    if (! result) {
        return result;
    }
    if (unic < 0x80) {
        *p++ = (unsigned char) unic;
    } else if (unic < 0x800) {
        *p++ = (unsigned char) (0xC0 | (unic >> 6));
        *p++ = (unsigned char) (0x80 | (unic & 0x3F));
    } else if (unic < 0x10000) {
        *p++ = (unsigned char) (0xE0 | (unic >> 12));
        *p++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
        *p++ = (unsigned char) (0x80 | (unic & 0x3F));
    } else if (unic < 0x110000) {
        *p++ = (unsigned char) (0xF0 | (unic >> 18));
        *p++ = (unsigned char) (0x80 | ((unic >> 12) & 0x3F));
        *p++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3F));
        *p++ = (unsigned char) (0x80 | (unic & 0x3F));
    }
    /*tex For an out of range value nothing was written, so this gives the empty string. */
    *p = '\0';
    return result;
}

/*tex

    Function |buffer_to_unichar| converts a sequence of bytes in the |buffer| into a \UNICODE\
    character value. It does not check for overflow of the |buffer|, but it is careful to check
    the validity of the \UTF-8 encoding. For historical reasons all these small helpers look a bit
    different but that has a certain charm so we keep it.

*/

/*tex

    Deposit the \UTF-8 encoding of |unic| at |utf8_text| and return the position right after the
    last byte written. No terminator is added and the available room is not checked. Values of
    |0x110000| and above write nothing, so then the incoming pointer is returned as is.

*/

char *aux_uni2string(char *utf8_text, unsigned unic)
{
    if (unic < 0x80) {
        utf8_text[0] = (char) unic;
        return utf8_text + 1;
    }
    if (unic < 0x800) {
        utf8_text[0] = (char) (0xC0 | (unic >> 6));
        utf8_text[1] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 2;
    }
    if (unic < 0x10000) {
        utf8_text[0] = (char) (0xE0 | (unic >> 12));
        utf8_text[1] = (char) (0x80 | ((unic >> 6) & 0x3F));
        utf8_text[2] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 3;
    }
    if (unic < 0x110000) {
        utf8_text[0] = (char) (0xF0 | (unic >> 18));
        utf8_text[1] = (char) (0x80 | ((unic >> 12) & 0x3F));
        utf8_text[2] = (char) (0x80 | ((unic >> 6) & 0x3F));
        utf8_text[3] = (char) (0x80 | (unic & 0x3F));
        return utf8_text + 4;
    }
    /*tex Out of range: leave the buffer alone. */
    return utf8_text;
}

/*tex This one could be more efficient because we know a bit more. */

/*tex

    Split the zero terminated \UTF-8 string |utf8buf| into code points that are stored in |ubuf|,
    followed by a zero entry. The number of code points (not counting that zero) is returned. At
    most |strlen(utf8buf)| values plus the final zero are written, so that is the room that |ubuf|
    has to offer.

    The sequence length is derived from the lead byte only: up to |0x7F| is one byte, up to |0xDF|
    two, up to |0xEF| three and anything else four. Continuation bytes are not validated. A
    sequence that is cut short by the end of the string gives |0xFFFD| and ends the conversion, so
    nothing beyond the terminator is ever read.

*/

unsigned aux_splitutf2uni(unsigned int *ubuf, const char *utf8buf)
{
    const size_t len = strlen(utf8buf);
    unsigned int *upt = ubuf;
    unsigned int *uend = ubuf + len;
    const unsigned char *pt = (const unsigned char *) utf8buf;
    const unsigned char *end = pt + len;
    while (pt < end && upt < uend) {
        const unsigned lead = *pt;
        const size_t need = lead <= 0x7F ? 1 : lead <= 0xDF ? 2 : lead <= 0xEF ? 3 : 4;
        if ((size_t) (end - pt) < need) {
            /*tex The tail is incomplete; reading on would run past the terminator. */
            *upt++ = 0xFFFD;
            break;
        }
        switch (need) {
            case 1:
                *upt = lead;
                break;
            case 2:
                *upt = ((lead & 0x1Fu) << 6) | (pt[1] & 0x3Fu);
                break;
            case 3:
                *upt = ((lead & 0x0Fu) << 12) | ((pt[1] & 0x3Fu) << 6) | (pt[2] & 0x3Fu);
                break;
            default:
                /*tex
                    Unsigned arithmetic all the way: a signed \quote {plane minus one}
                    intermediate would left shift a negative value for overlong input.
                */
                *upt = ((lead & 0x07u) << 18) | ((pt[1] & 0x3Fu) << 12) | ((pt[2] & 0x3Fu) << 6) | (pt[3] & 0x3Fu);
                break;
        }
        pt += need;
        ++upt;
    }
    *upt = 0; /*tex We have integers here, so this is a zero entry and not a |\0| character. */
    return (unsigned int) (upt - ubuf);
}

/*tex

    Count the characters in the first |size| bytes of |text|. Only lead bytes are looked at: the
    high nibble tells how many bytes get skipped (|0xF| four, |0xE| three, |0xC| and |0xD| two,
    anything else one). Continuation bytes are not validated and a stray one counts as a character
    of its own. A sequence that is cut off by |size| is still counted once.

*/

size_t aux_utf8len(const char *text, size_t size)
{
    size_t count = 0;
    size_t pos = 0;
    while (pos < size) {
        const unsigned char lead = (unsigned char) text[pos];
        size_t step;
        switch (lead >> 4) {
            case 0xF:
                step = 4;
                break;
            case 0xE:
                step = 3;
                break;
            case 0xD:
            case 0xC:
                step = 2;
                break;
            default:
                /*tex Plain \ASCII\ as well as stray continuation bytes. */
                step = 1;
                break;
        }
        pos += step;
        ++count;
    }
    return count;
}