/* * Copyright 2010-2019 JetBrains s.r.o. and Kotlin Programming Language contributors. * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file. */ package test.text import test.assertArrayContentEquals import test.testOnNonJvm6And7 import kotlin.test.* // When decoding utf-8, JVM and JS implementations replace the sequence reflecting a surrogate code point differently. // JS replaces each byte of the sequence by the replacement char, whereas JVM replaces the whole sequence with a single replacement char. // See corresponding actual to find out the replacement. internal expect val surrogateCodePointDecoding: String // The byte sequence used to replace a surrogate char. // JVM default replacement sequence consist of single 0x3F byte. // JS and Native replacement byte sequence is [0xEF, 0xBF, 0xBD]. internal expect val surrogateCharEncoding: ByteArray class StringEncodingTest { private fun bytes(vararg elements: Int) = ByteArray(elements.size) { elements[it].toByte() } private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String) { assertArrayContentEquals(expected, string.encodeToByteArray()) if (!isWellFormed) { assertFailsWith { string.encodeToByteArray(throwOnInvalidSequence = true) } } else { assertArrayContentEquals(expected, string.encodeToByteArray(throwOnInvalidSequence = true)) assertEquals(string, string.encodeToByteArray(throwOnInvalidSequence = true).decodeToString()) } } private fun testEncoding(isWellFormed: Boolean, expected: ByteArray, string: String, startIndex: Int, endIndex: Int) { assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex)) if (!isWellFormed) { assertFailsWith { string.encodeToByteArray(startIndex, endIndex, true) } } else { assertArrayContentEquals(expected, string.encodeToByteArray(startIndex, endIndex, true)) assertEquals( string.substring(startIndex, endIndex), string.encodeToByteArray(startIndex, endIndex, true).decodeToString() ) } } // https://youtrack.jetbrains.com/issue/KT-31614 private fun string(vararg codeUnits: Int): String { return buildString { codeUnits.forEach { append(Char(it)) } } } @Test fun encodeToByteArray() { // empty string testEncoding(true, bytes(), "") // 1-byte chars testEncoding(true, bytes(0), "\u0000") testEncoding(true, bytes(0x2D), "-") testEncoding(true, bytes(0x7F), "\u007F") // 2-byte chars testEncoding(true, bytes(0xC2, 0x80), "\u0080") testEncoding(true, bytes(0xC2, 0xBF), "¿") testEncoding(true, bytes(0xDF, 0xBF), "\u07FF") // 3-byte chars testEncoding(true, bytes(0xE0, 0xA0, 0x80), "\u0800") testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤") testEncoding(true, bytes(0xED, 0x9F, 0xBF), "\uD7FF") // surrogate chars testEncoding(false, surrogateCharEncoding, string(0xD800)) testEncoding(false, surrogateCharEncoding, string(0xDB6A)) testEncoding(false, surrogateCharEncoding, string(0xDFFF)) // 3-byte chars testEncoding(true, bytes(0xEE, 0x80, 0x80), "\uE000") testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C") testEncoding(true, bytes(0xEF, 0xBF, 0xBF), "\uFFFF") // 4-byte surrogate pairs testEncoding(true, bytes(0xF0, 0x90, 0x80, 0x80), "\uD800\uDC00") testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC") testEncoding(true, bytes(0xF4, 0x8F, 0xBF, 0xBF), "\uDBFF\uDFFF") // reversed surrogate pairs testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDC00, 0xD800)) testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDDFC, 0xDA49)) testEncoding(false, surrogateCharEncoding + surrogateCharEncoding, string(0xDFFF, 0xDBFF)) testEncoding( false, bytes( 0, /**/ 0x2D, /**/ 0x7F, /**/ 0xC2, 0x80, /**/ 0xC2, 0xBF, /**/ 0xDF, 0xBF, /**/ 0xE0, 0xA0, 0x80, /**/ 0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A ) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding /**/ + 0x7A /**/ + surrogateCharEncoding, "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A) ) testEncoding( true, bytes( 0xEE, 0x80, 0x80, /**/ 0xEF, 0x98, 0xBC, /**/ 0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC, /**/ 0xF4, 0x8F, 0xBF, 0xBF ), "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF" ) val longChars = CharArray(200_000) { 'k' } val longBytes = longChars.concatToString().encodeToByteArray() assertEquals(200_000, longBytes.size) assertTrue { longBytes.all { it == 0x6B.toByte() } } } @Test fun encodeToByteArraySlice() { assertFailsWith { "".encodeToByteArray(startIndex = 1) } assertFailsWith { "123".encodeToByteArray(startIndex = 10) } assertFailsWith { "123".encodeToByteArray(startIndex = -1) } assertFailsWith { "123".encodeToByteArray(endIndex = 10) } assertFailsWith { "123".encodeToByteArray(endIndex = -1) } assertFailsWith { "123".encodeToByteArray(startIndex = 5, endIndex = 10) } assertFailsWith { "123".encodeToByteArray(startIndex = 5, endIndex = 2) } assertFailsWith { "123".encodeToByteArray(startIndex = 1, endIndex = 4) } testEncoding(true, bytes(), "abc", 0, 0) testEncoding(true, bytes(), "abc", 3, 3) testEncoding(true, bytes(0x62, 0x63), "abc", 1, 3) testEncoding(true, bytes(0x61, 0x62), "abc", 0, 2) testEncoding(true, bytes(0x62), "abc", 1, 2) testEncoding(true, bytes(0x2D), "-", 0, 1) testEncoding(true, bytes(0xC2, 0xBF), "¿", 0, 1) testEncoding(true, bytes(0xE6, 0x96, 0xA4), "斤", 0, 1) testEncoding(false, surrogateCharEncoding, string(0xDB6A), 0, 1) testEncoding(true, bytes(0xEF, 0x98, 0xBC), "\uF63C", 0, 1) testEncoding(true, bytes(0xF2, 0xA2, 0x97, 0xBC), "\uDA49\uDDFC", 0, 2) testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 0, 1) testEncoding(false, surrogateCharEncoding, "\uDA49\uDDFC", 1, 2) testEncoding( false, bytes(0xE6, 0x96, 0xA4, /**/ 0xED, 0x9F, 0xBF, /**/ 0x7A) /**/ + surrogateCharEncoding /**/ + surrogateCharEncoding, "\u0000-\u007F\u0080¿\u07FF\u0800斤\uD7FFz" + string(0xDFFF, 0xD800, 0x7A, 0xDB6A, 0x7A, 0xDB6A), startIndex = 7, endIndex = 12 ) testEncoding( false, bytes(0xC2, 0xBF, /**/ 0xEF, 0xBF, 0xBF, /**/ 0xF0, 0x90, 0x80, 0x80, /**/ 0xF2, 0xA2, 0x97, 0xBC) /**/ + surrogateCharEncoding, "\uE000\uF63C¿\uFFFF\uD800\uDC00\uDA49\uDDFC\uDBFF\uDFFF", startIndex = 2, endIndex = 9 ) val longChars = CharArray(200_000) { 'k' } val longBytes = longChars.concatToString().encodeToByteArray(startIndex = 5000, endIndex = 195_000) assertEquals(190_000, longBytes.size) assertTrue { longBytes.all { it == 0x6B.toByte() } } } private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray) { assertEquals(expected, bytes.decodeToString()) if (!isWellFormed) { assertFailsWith { bytes.decodeToString(throwOnInvalidSequence = true) } } else { assertEquals(expected, bytes.decodeToString(throwOnInvalidSequence = true)) assertArrayContentEquals(bytes, bytes.decodeToString(throwOnInvalidSequence = true).encodeToByteArray()) } } private fun testDecoding(isWellFormed: Boolean, expected: String, bytes: ByteArray, startIndex: Int, endIndex: Int) { assertEquals(expected, bytes.decodeToString(startIndex, endIndex)) if (!isWellFormed) { assertFailsWith { bytes.decodeToString(startIndex, endIndex, true) } } else { assertEquals(expected, bytes.decodeToString(startIndex, endIndex, true)) assertArrayContentEquals( bytes.sliceArray(startIndex until endIndex), bytes.decodeToString(startIndex, endIndex, true).encodeToByteArray() ) } } private fun truncatedSurrogateDecoding() = surrogateCodePointDecoding.let { if (it.length > 1) it.dropLast(1) else it } @Test fun decodeToString() { testDecoding(true, "", bytes()) // empty testDecoding(true, "\u0000", bytes(0x0)) // null char testDecoding(true, "zC", bytes(0x7A, 0x43)) // 1-byte chars testDecoding(false, "��", bytes(0x85, 0xAF)) // invalid bytes starting with 1 bit testDecoding(true, "¿", bytes(0xC2, 0xBF)) // 2-byte char testDecoding(false, "�z", bytes(0xCF, 0x7A)) // 2-byte char, second byte starts with 0 bit testDecoding(false, "��", bytes(0xC1, 0xAA)) // 1-byte char written in two bytes testDecoding(false, "�z", bytes(0xEF, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit testDecoding(false, "���", bytes(0xE0, 0x9F, 0xAF)) // 2-byte char written in three bytes testDecoding(false, "�z", bytes(0xE0, 0xAF, 0x7A)) // 3-byte char, third byte starts with 0 bit testDecoding(true, "\u1FFF", bytes(0xE1, 0xBF, 0xBF)) // 3-byte char testOnNonJvm6And7 { testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF)) // 3-byte high-surrogate char testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xB3, 0x9A)) // 3-byte low-surrogate char testDecoding( false, surrogateCodePointDecoding + surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF, /**/ 0xED, 0xB3, 0x9A) ) // surrogate pair chars testDecoding(false, "�z", bytes(0xEF, 0x7A)) // 3-byte char, second byte starts with 0 bit, third byte missing testDecoding(false, "�����", bytes(0xF9, 0x94, 0x80, 0x80, 0x80)) // 5-byte code point larger than 0x10FFFF testDecoding(false, "������", bytes(0xFD, 0x94, 0x80, 0x80, 0x80, 0x80)) // 6-byte code point larger than 0x10FFFF // Ill-Formed Sequences for Surrogates testDecoding( false, surrogateCodePointDecoding + surrogateCodePointDecoding + truncatedSurrogateDecoding() + "A", bytes(0xED, 0xA0, 0x80, /**/ 0xED, 0xBF, 0xBF, /**/ 0xED, 0xAF, /**/ 0x41) ) // Truncated Sequences testDecoding(false, "����A", bytes(0xE1, 0x80, /**/ 0xE2, /**/ 0xF0, 0x91, 0x92, /**/ 0xF1, 0xBF, /**/ 0x41)) } testDecoding(false, "�", bytes(0xE0, 0xAF)) // 3-byte char, third byte missing testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F)) // 4-byte char testDecoding(false, "����", bytes(0xF0, 0x8F, 0x9F, 0x9F)) // 3-byte char written in four bytes testDecoding(false, "����", bytes(0xF4, 0x9F, 0x9F, 0x9F)) // 4-byte code point larger than 0x10FFFF testDecoding(false, "����", bytes(0xF5, 0x80, 0x80, 0x80)) // 4-byte code point larger than 0x10FFFF // Non-Shortest Form Sequences testDecoding(false, "��������A", bytes(0xC0, 0xAF, /**/ 0xE0, 0x80, 0xBF, /**/ 0xF0, 0x81, 0x82, /**/ 0x41)) // Other Ill-Formed Sequences testDecoding(false, "�����A��B", bytes(0xF4, 0x91, 0x92, 0x93, /**/ 0xFF, /**/ 0x41, /**/ 0x80, 0xBF, /**/ 0x42)) val longBytes = ByteArray(200_000) { 0x6B.toByte() } val longString = longBytes.decodeToString() assertEquals(200_000, longString.length) assertTrue { longString.all { it == 'k' } } } @Test fun decodeToStringSlice() { assertFailsWith { bytes().decodeToString(1, 0) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 10) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = -1) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = 10) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(endIndex = -1) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 10) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 5, endIndex = 2) } assertFailsWith { bytes(0x61, 0x62, 0x63).decodeToString(startIndex = 1, endIndex = 4) } testDecoding(true, "", bytes(), startIndex = 0, endIndex = 0) testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 0) testDecoding(true, "", bytes(0x61, 0x62, 0x63), startIndex = 3, endIndex = 3) testDecoding(true, "abc", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 3) testDecoding(true, "ab", bytes(0x61, 0x62, 0x63), startIndex = 0, endIndex = 2) testDecoding(true, "bc", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 3) testDecoding(true, "b", bytes(0x61, 0x62, 0x63), startIndex = 1, endIndex = 2) testDecoding(true, "¿", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 2) testDecoding(false, "�", bytes(0xC2, 0xBF), startIndex = 0, endIndex = 1) testDecoding(false, "�", bytes(0xC2, 0xBF), startIndex = 1, endIndex = 2) testDecoding(false, "�", bytes(0xEF, 0xAF, 0x7A), startIndex = 0, endIndex = 2) testDecoding(false, "�z", bytes(0xEF, 0xAF, 0x7A), startIndex = 1, endIndex = 3) testDecoding(true, "z", bytes(0xEF, 0xAF, 0x7A), startIndex = 2, endIndex = 3) testOnNonJvm6And7 { testDecoding(false, surrogateCodePointDecoding, bytes(0xED, 0xAF, 0xBF), startIndex = 0, endIndex = 3) testDecoding(false, truncatedSurrogateDecoding(), bytes(0xED, 0xB3, 0x9A), startIndex = 0, endIndex = 2) testDecoding(false, "���", bytes(0xED, 0xAF, 0xBF, 0xED, 0xB3, 0x9A), startIndex = 1, endIndex = 4) testDecoding(false, "�", bytes(0xEF, 0x7A), startIndex = 0, endIndex = 1) testDecoding(true, "z", bytes(0xEF, 0x7A), startIndex = 1, endIndex = 2) } testDecoding(true, "\uD83D\uDFDF", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 0, endIndex = 4) testDecoding(false, "��", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 2, endIndex = 4) testDecoding(false, "��", bytes(0xF0, 0x9F, 0x9F, 0x9F), startIndex = 1, endIndex = 3) val longBytes = ByteArray(200_000) { 0x6B.toByte() } val longString = longBytes.decodeToString(startIndex = 5000, endIndex = 195_000) assertEquals(190_000, longString.length) assertTrue { longString.all { it == 'k' } } } @Test fun kotlinxIOUnicodeTest() { fun String.readHex(): ByteArray = split(" ") .filter { it.isNotBlank() } .map { it.toInt(16).toByte() } .toByteArray() val smokeTestData = "\ud83c\udf00" val smokeTestDataCharArray: CharArray = smokeTestData.toCharArray() val smokeTestDataAsBytes = "f0 9f 8c 80".readHex() val testData = "file content with unicode " + "\ud83c\udf00 :" + " \u0437\u0434\u043e\u0440\u043e\u0432\u0430\u0442\u044c\u0441\u044f :" + " \uc5ec\ubcf4\uc138\uc694 :" + " \u4f60\u597d :" + " \u00f1\u00e7" val testDataCharArray: CharArray = testData.toCharArray() val testDataAsBytes: ByteArray = ("66 69 6c 65 20 63 6f 6e 74 65 6e 74 20 77 69 74 " + " 68 20 75 6e 69 63 6f 64 65 20 f0 9f 8c 80 20 3a 20 d0 b7 d0 b4 d0 be d1 " + "80 d0 be d0 b2 d0 b0 d1 82 d1 8c d1 81 d1 8f 20 3a 20 ec 97 ac eb b3 b4 ec " + " 84 b8 ec 9a 94 20 3a 20 e4 bd a0 e5 a5 bd 20 3a 20 c3 b1 c3 a7").readHex() assertArrayContentEquals(smokeTestDataAsBytes, smokeTestData.encodeToByteArray()) assertArrayContentEquals(testDataAsBytes, testData.encodeToByteArray()) assertEquals(smokeTestData, smokeTestDataAsBytes.decodeToString()) assertEquals(testData, testDataAsBytes.decodeToString()) assertEquals(smokeTestData, smokeTestDataCharArray.concatToString()) assertEquals(testData, testDataCharArray.concatToString()) assertArrayContentEquals(smokeTestDataCharArray, smokeTestData.toCharArray()) assertArrayContentEquals(testDataCharArray, testData.toCharArray()) assertArrayContentEquals(smokeTestDataAsBytes, smokeTestDataCharArray.concatToString().encodeToByteArray()) assertArrayContentEquals(testDataAsBytes, testDataCharArray.concatToString().encodeToByteArray()) assertArrayContentEquals(smokeTestDataCharArray, smokeTestDataAsBytes.decodeToString().toCharArray()) assertArrayContentEquals(testDataCharArray, testDataAsBytes.decodeToString().toCharArray()) assertEquals("\uD858\uDE18\n", bytes(0xF0, 0xA6, 0x88, 0x98, 0x0a).decodeToString()) assertEquals("\u0BF5\n", bytes(0xE0, 0xAF, 0xB5, 0x0A).decodeToString()) assertEquals("\u041a\n", bytes(0xD0, 0x9A, 0x0A).decodeToString()) } }