kotlin/libraries/stdlib/test/text/StringEncodingTest.kt

/*
 * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors.
 * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
 */

package test.text

import test.assertArrayContentEquals
import test.testOnNonJvm6And7
import kotlin.test.*

// When decoding UTF-8, the JVM and JS implementations differ in how they replace a byte sequence that encodes a surrogate code point.
// JS replaces each byte of the sequence with the replacement char, whereas JVM replaces the whole sequence with a single replacement char.
// See the corresponding `actual` declaration for the platform-specific replacement.
internal expect val surrogateCodePointDecoding: String

// The byte sequence used to replace a surrogate char.
// The JVM default replacement sequence consists of a single 0x3F byte.
// The JS and Native replacement byte sequence is [0xEF, 0xBF, 0xBD].
internal expect val surrogateCharEncoding: ByteArray

/**
 * Tests for UTF-8 conversion between [String] and [ByteArray]:
 * `String.encodeToByteArray`, `ByteArray.decodeToString` and their slice (startIndex/endIndex) variants.
 *
 * The replacement character U+FFFD is always written as the escape `\uFFFD` in expected strings,
 * so the expectations do not depend on the encoding the source file is stored or transferred in.
 */
class StringEncodingTest {
    /** Builds a [ByteArray] from integer literals; each element is narrowed with [Int.toByte]. */
    private fun bytes(vararg elements: Int) = ByteArray(elements.size) { elements[it].toByte() }

    /**
     * Checks UTF-8 encoding of the whole [string].
     *
     * The lenient `encodeToByteArray()` must always produce [expected].
     * With `throwOnInvalidSequence = true` the call must throw [CharacterCodingException] when
     * [isWellFormed] is `false`; otherwise it must produce [expected] and decode back to [string].
     */
    private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String) {
        assertArrayContentEquals(expected, string.encodeToByteArray())
        if (!isWellFormed) {
            assertFailsWith<CharacterCodingException> { string.encodeToByteArray(throwOnInvalidSequence = true) }
        } else {
            assertArrayContentEquals(expected, string.encodeToByteArray(throwOnInvalidSequence = true))
            assertEquals(string, string.encodeToByteArray(throwOnInvalidSequence = true).decodeToString())
        }
    }

    /**
     * Checks UTF-8 encoding of the `[startIndex, endIndex)` char range of [string].
     *
     * Same contract as the whole-string overload, except that the round trip is compared
     * against `string.substring(startIndex, endIndex)`.
     */
    private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String, startIndex: Int, endIndex: Int) {
        assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex))
        if (!isWellFormed) {
            assertFailsWith<CharacterCodingException> {
                string.encodeToByteArray(startIndex, endIndex, throwOnInvalidSequence = true)
            }
        } else {
            assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex, throwOnInvalidSequence = true))
            assertEquals(
                string.substring(startIndex, endIndex),
                string.encodeToByteArray(startIndex, endIndex, throwOnInvalidSequence = true).decodeToString()
            )
        }
    }

    /**
     * Builds a string from raw UTF-16 code units, appending `Char(unit)` for each of [codeUnits].
     * Used in these tests to construct strings containing unpaired surrogates.
     *
     * See https://youtrack.jetbrains.com/issue/KT-31614
     */
    private fun string(vararg codeUnits: Int): String {
        return buildString { codeUnits.forEach { append(Char(it)) } }
    }

    @Test
    fun encodeToByteArray() {
        // empty string
        testEncoding(true, bytes(), "")

        // 1-byte chars
        testEncoding(true, bytes(0), "\u0000")
        testEncoding(true, bytes(0x2D), "-")
        testEncoding(true, bytes(0x7F), "\u007F")

        // 2-byte chars
        testEncoding(true, bytes(0xC2, 0x80), "\u0080")
        testEncoding(true, bytes(0xC2, 0xBF), "¿")
        testEncoding(true, bytes(0xDF, 0xBF), "\u07FF")

        // 3-byte chars below the surrogate range
        testEncoding(true, bytes(0xE0, 0xA0, 0x80), "\u0800")
        testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤")
        testEncoding(true, bytes(0xED, 0x9F, 0xBF), "\uD7FF")

        // unpaired surrogate chars
        testEncoding(false, surrogateCharEncoding, string(0xD800))
        testEncoding(false, surrogateCharEncoding, string(0xDB6A))
        testEncoding(false, surrogateCharEncoding, string(0xDFFF))

        // 3-byte chars above the surrogate range
        testEncoding(true, bytes(0xEE, 0x80, 0x80), "\uE000")
        testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C")
        testEncoding(true, bytes(0xEF, 0xBF, 0xBF), "\uFFFF")

        // 4-byte surrogate pairs
        testEncoding(true, bytes(0xF0, 0x90, 0x80, 0x80), "\uD800\uDC00")
        testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC")
        testEncoding(true, bytes(0xF4, 0x8F, 0xBF, 0xBF), "\uDBFF\uDFFF")

        // reversed surrogate pairs: each half is replaced on its own
        testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDC00, 0xD800))
        testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDDFC, 0xDA49))
        testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDFFF, 0xDBFF))

        // mixed well-formed chars followed by unpaired surrogates; `/**/` marks char boundaries
        testEncoding(
            false,
            bytes(
                0, /**/ 0x2D, /**/ 0x7F, /**/ 0xC2, 0x80, /**/ 0xC2, 0xBF, /**/ 0xDF, 0xBF, /**/ 0xE0, 0xA0, 0x80, /**/
                0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A
            ) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding,
            "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A)
        )

        // mixed well-formed 2-, 3- and 4-byte chars
        testEncoding(
            true,
            bytes(
                0xEE, 0x80, 0x80, /**/ 0xEF, 0x98, 0xBC, /**/ 0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/
                0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC, /**/ 0xF4, 0x8F, 0xBF, 0xBF
            ),
            "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF"
        )

        // long input
        val longChars = CharArray(200_000) { 'k' }
        val longBytes = longChars.concatToString().encodeToByteArray()
        assertEquals(200_000, longBytes.size)
        assertTrue { longBytes.all { it == 0x6B.toByte() } }
    }

    @Test
    fun encodeToByteArraySlice() {
        // invalid index ranges
        assertFailsWith<IllegalArgumentException> { "".encodeToByteArray(startIndex = 1) }
        assertFailsWith<IllegalArgumentException> { "123".encodeToByteArray(startIndex = 10) }
        assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { "123".encodeToByteArray(endIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = 5, endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { "123".encodeToByteArray(startIndex = 5, endIndex = 2) }
        assertFailsWith<IndexOutOfBoundsException> { "123".encodeToByteArray(startIndex = 1, endIndex = 4) }

        // empty and ASCII slices
        testEncoding(true, bytes(), "abc", 0, 0)
        testEncoding(true, bytes(), "abc", 3, 3)
        testEncoding(true, bytes(0x62, 0x63), "abc", 1, 3)
        testEncoding(true, bytes(0x61, 0x62), "abc", 0, 2)
        testEncoding(true, bytes(0x62), "abc", 1, 2)

        // single chars of each encoded length
        testEncoding(true, bytes(0x2D), "-", 0, 1)
        testEncoding(true, bytes(0xC2, 0xBF), "¿", 0, 1)
        testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤", 0, 1)

        testEncoding(false, surrogateCharEncoding, string(0xDB6A), 0, 1)

        testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C", 0, 1)

        // a surrogate pair is well-formed only when the slice contains both halves
        testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC", 0, 2)
        testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 0, 1)
        testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 1, 2)

        testEncoding(
            false,
            bytes(0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding,
            "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A),
            startIndex = 7,
            endIndex = 12
        )

        // endIndex = 9 cuts the last surrogate pair in half, leaving an unpaired high surrogate
        testEncoding(
            false,
            bytes(0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC) /**/ + surrogateCharEncoding,
            "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF",
            startIndex = 2,
            endIndex = 9
        )

        // long input
        val longChars = CharArray(200_000) { 'k' }
        val longBytes = longChars.concatToString().encodeToByteArray(startIndex = 5000, endIndex = 195_000)
        assertEquals(190_000, longBytes.size)
        assertTrue { longBytes.all { it == 0x6B.toByte() } }
    }

    /**
     * Checks UTF-8 decoding of the whole [bytes] array.
     *
     * The lenient `decodeToString()` must always produce [expected].
     * With `throwOnInvalidSequence = true` the call must throw [CharacterCodingException] when
     * [isWellFormed] is `false`; otherwise it must produce [expected] and encode back to [bytes].
     */
    private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray) {
        assertEquals(expected, bytes.decodeToString())
        if (!isWellFormed) {
            assertFailsWith<CharacterCodingException> { bytes.decodeToString(throwOnInvalidSequence = true) }
        } else {
            assertEquals(expected, bytes.decodeToString(throwOnInvalidSequence = true))
            assertArrayContentEquals(bytes, bytes.decodeToString(throwOnInvalidSequence = true).encodeToByteArray())
        }
    }

    /**
     * Checks UTF-8 decoding of the `[startIndex, endIndex)` byte range of [bytes].
     *
     * Same contract as the whole-array overload, except that the round trip is compared
     * against `bytes.sliceArray(startIndex until endIndex)`.
     */
    private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray, startIndex: Int, endIndex: Int) {
        assertEquals(expected, bytes.decodeToString(startIndex, endIndex))
        if (!isWellFormed) {
            assertFailsWith<CharacterCodingException> {
                bytes.decodeToString(startIndex, endIndex, throwOnInvalidSequence = true)
            }
        } else {
            assertEquals(expected, bytes.decodeToString(startIndex, endIndex, throwOnInvalidSequence = true))
            assertArrayContentEquals(
                bytes.sliceArray(startIndex until endIndex),
                bytes.decodeToString(startIndex, endIndex, throwOnInvalidSequence = true).encodeToByteArray()
            )
        }
    }

    /**
     * Expected decoding of a surrogate code point sequence with its last byte cut off:
     * [surrogateCodePointDecoding] without its last char when it is longer than one char,
     * otherwise [surrogateCodePointDecoding] unchanged.
     */
    private fun truncatedSurrogateDecoding() =
        surrogateCodePointDecoding.let { if (it.length > 1) it.dropLast(1) else it }

    @Test
    fun decodeToString() {
        testDecoding(true, "", bytes()) // empty
        testDecoding(true, "\u0000", bytes(0x0)) // null char
        testDecoding(true, "zC", bytes(0x7A, 0x43)) // 1-byte chars

        testDecoding(false, "\uFFFD\uFFFD", bytes(0x85, 0xAF)) // invalid bytes starting with 1 bit
        testDecoding(true, "¿", bytes(0xC2, 0xBF)) // 2-byte char
        testDecoding(false, "\uFFFDz", bytes(0xCF, 0x7A)) // 2-byte char, second byte starts with 0 bit
        testDecoding(false, "\uFFFD\uFFFD", bytes(0xC1, 0xAA)) // 1-byte char written in two bytes

        testDecoding(false, "\uFFFDz", bytes(0xEF, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit
        testDecoding(false, "\uFFFD\uFFFD\uFFFD", bytes(0xE0, 0x9F, 0xAF)) // 2-byte char written in three bytes
        testDecoding(false, "\uFFFDz", bytes(0xE0, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit
        testDecoding(true, "\u1FFF", bytes(0xE1, 0xBF, 0xBF)) // 3-byte char

        testOnNonJvm6And7 {
            testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF)) // 3-byte high-surrogate char
            testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xB3, 0x9A)) // 3-byte low-surrogate char
            testDecoding(
                false,
                surrogateCodePointDecoding + surrogateCodePointDecoding,
                bytes(0xED, 0xAF, 0xBF, /**/ 0xED, 0xB3, 0x9A)
            ) // surrogate pair chars
            testDecoding(false, "\uFFFDz", bytes(0xEF, 0x7A)) // 3-byte char, second byte starts with 0 bit, third byte missing

            testDecoding(
                false,
                "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
                bytes(0xF9, 0x94, 0x80, 0x80, 0x80)
            ) // 5-byte code point larger than 0x10FFFF
            testDecoding(
                false,
                "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD",
                bytes(0xFD, 0x94, 0x80, 0x80, 0x80, 0x80)
            ) // 6-byte code point larger than 0x10FFFF

            // Ill-Formed Sequences for Surrogates
            testDecoding(
                false,
                surrogateCodePointDecoding + surrogateCodePointDecoding + truncatedSurrogateDecoding() + "A",
                bytes(0xED, 0xA0, 0x80, /**/ 0xED, 0xBF, 0xBF, /**/ 0xED, 0xAF, /**/ 0x41)
            )
            // Truncated Sequences: each truncated sequence yields a single replacement char
            testDecoding(
                false,
                "\uFFFD\uFFFD\uFFFD\uFFFDA",
                bytes(0xE1, 0x80, /**/ 0xE2, /**/ 0xF0, 0x91, 0x92, /**/ 0xF1, 0xBF, /**/ 0x41)
            )
        }

        testDecoding(false, "\uFFFD", bytes(0xE0, 0xAF)) // 3-byte char, third byte missing

        testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F)) // 4-byte char
        testDecoding(false, "\uFFFD\uFFFD\uFFFD\uFFFD", bytes(0xF0, 0x8F, 0x9F, 0x9F)) // 3-byte char written in four bytes
        testDecoding(false, "\uFFFD\uFFFD\uFFFD\uFFFD", bytes(0xF4, 0x9F, 0x9F, 0x9F)) // 4-byte code point larger than 0x10FFFF
        testDecoding(false, "\uFFFD\uFFFD\uFFFD\uFFFD", bytes(0xF5, 0x80, 0x80, 0x80)) // 4-byte code point larger than 0x10FFFF

        // Non-Shortest Form Sequences: every one of the 8 bytes is replaced individually
        testDecoding(
            false,
            "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDA",
            bytes(0xC0, 0xAF, /**/ 0xE0, 0x80, 0xBF, /**/ 0xF0, 0x81, 0x82, /**/ 0x41)
        )
        // Other Ill-Formed Sequences
        testDecoding(
            false,
            "\uFFFD\uFFFD\uFFFD\uFFFD\uFFFDA\uFFFD\uFFFDB",
            bytes(0xF4, 0x91, 0x92, 0x93, /**/ 0xFF, /**/ 0x41, /**/ 0x80, 0xBF, /**/ 0x42)
        )

        // long input
        val longBytes = ByteArray(200_000) { 0x6B.toByte() }
        val longString = longBytes.decodeToString()
        assertEquals(200_000, longString.length)
        assertTrue { longString.all { it == 'k' } }
    }

    @Test
    fun decodeToStringSlice() {
        // invalid index ranges
        assertFailsWith<IllegalArgumentException> { bytes().decodeToString(startIndex = 1, endIndex = 0) }
        assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 10) }
        assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = -1) }
        assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 10) }
        assertFailsWith<IllegalArgumentException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 2) }
        assertFailsWith<IndexOutOfBoundsException> { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 1, endIndex = 4) }

        // empty and ASCII slices
        testDecoding(true, "", bytes(), startIndex = 0, endIndex = 0)
        testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 0)
        testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 3, endIndex = 3)
        testDecoding(true, "abc", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 3)
        testDecoding(true, "ab", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 2)
        testDecoding(true, "bc", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 3)
        testDecoding(true, "b", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 2)

        // a 2-byte char is well-formed only when the slice contains both bytes
        testDecoding(true, "¿", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 2)
        testDecoding(false, "\uFFFD", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 1)
        testDecoding(false, "\uFFFD", bytes(0xC2, 0xBF), startIndex = 1, endIndex = 2)

        testDecoding(false, "\uFFFD", bytes(0xEF, 0xAF, 0x7A), startIndex = 0, endIndex = 2)
        testDecoding(false, "\uFFFDz", bytes(0xEF, 0xAF, 0x7A), startIndex = 1, endIndex = 3)
        testDecoding(true, "z", bytes(0xEF, 0xAF, 0x7A), startIndex = 2, endIndex = 3)

        testOnNonJvm6And7 {
            testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF), startIndex = 0, endIndex = 3)
            testDecoding(false, truncatedSurrogateDecoding(), bytes(0xED, 0xB3, 0x9A), startIndex = 0, endIndex = 2)
            testDecoding(false, "\uFFFD\uFFFD\uFFFD", bytes(0xED, 0xAF, 0xBF, 0xED, 0xB3, 0x9A), startIndex = 1, endIndex = 4)
            testDecoding(false, "\uFFFD", bytes(0xEF, 0x7A), startIndex = 0, endIndex = 1)
            testDecoding(true, "z", bytes(0xEF, 0x7A), startIndex = 1, endIndex = 2)
        }

        // a 4-byte char is well-formed only when the slice contains all four bytes
        testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 0, endIndex = 4)
        testDecoding(false, "\uFFFD\uFFFD", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 2, endIndex = 4)
        testDecoding(false, "\uFFFD\uFFFD", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 1, endIndex = 3)

        // long input
        val longBytes = ByteArray(200_000) { 0x6B.toByte() }
        val longString = longBytes.decodeToString(startIndex = 5000, endIndex = 195_000)
        assertEquals(190_000, longString.length)
        assertTrue { longString.all { it == 'k' } }
    }

    @Test
    fun kotlinxIOUnicodeTest() {
        // Parses space-separated hex octets ("f0 9f 8c 80") into a ByteArray; blank tokens are skipped.
        fun String.readHex(): ByteArray = split(" ")
            .filter { it.isNotBlank() }
            .map { it.toInt(16).toByte() }
            .toByteArray()

        val smokeTestData = "\ud83c\udf00"
        val smokeTestDataCharArray: CharArray = smokeTestData.toCharArray()
        val smokeTestDataAsBytes = "f0 9f 8c 80".readHex()

        val testData = "file content with unicode " +
                "\ud83c\udf00 :" +
                " \u0437\u0434\u043e\u0440\u043e\u0432\u0430\u0442\u044c\u0441\u044f :" +
                " \uc5ec\ubcf4\uc138\uc694 :" +
                " \u4f60\u597d :" +
                " \u00f1\u00e7"
        val testDataCharArray: CharArray = testData.toCharArray()
        val testDataAsBytes: ByteArray = ("66 69 6c 65 20 63 6f 6e 74 65 6e 74 20 77 69 74 " +
                " 68 20 75 6e 69 63 6f 64 65 20 f0 9f 8c 80 20 3a 20 d0 b7 d0 b4 d0 be d1 " +
                "80 d0 be d0 b2 d0 b0 d1 82 d1 8c d1 81 d1 8f 20 3a 20 ec 97 ac eb b3 b4 ec " +
                " 84 b8 ec 9a 94 20 3a 20 e4 bd a0 e5 a5 bd 20 3a 20 c3 b1 c3 a7").readHex()


        // String -> bytes
        assertArrayContentEquals(smokeTestDataAsBytes, smokeTestData.encodeToByteArray())
        assertArrayContentEquals(testDataAsBytes, testData.encodeToByteArray())

        // bytes -> String
        assertEquals(smokeTestData, smokeTestDataAsBytes.decodeToString())
        assertEquals(testData, testDataAsBytes.decodeToString())

        // CharArray -> String
        assertEquals(smokeTestData, smokeTestDataCharArray.concatToString())
        assertEquals(testData, testDataCharArray.concatToString())

        // String -> CharArray
        assertArrayContentEquals(smokeTestDataCharArray, smokeTestData.toCharArray())
        assertArrayContentEquals(testDataCharArray, testData.toCharArray())

        // CharArray -> String -> bytes
        assertArrayContentEquals(smokeTestDataAsBytes, smokeTestDataCharArray.concatToString().encodeToByteArray())
        assertArrayContentEquals(testDataAsBytes, testDataCharArray.concatToString().encodeToByteArray())

        // bytes -> String -> CharArray
        assertArrayContentEquals(smokeTestDataCharArray, smokeTestDataAsBytes.decodeToString().toCharArray())
        assertArrayContentEquals(testDataCharArray, testDataAsBytes.decodeToString().toCharArray())

        // 4-, 3- and 2-byte sequences, each followed by a line feed
        assertEquals("\uD858\uDE18\n", bytes(0xF0, 0xA6, 0x88, 0x98, 0x0a).decodeToString())
        assertEquals("\u0BF5\n", bytes(0xE0, 0xAF, 0xB5, 0x0A).decodeToString())
        assertEquals("\u041a\n", bytes(0xD0, 0x9A, 0x0A).decodeToString())
    }
}