Write valid UTF8 characters without escaping.

Summary: Commit 6a18528 introduced escaping of bytes >= 127 to ensure that KConfig files are valid UTF-8. That simplistic cutoff escapes many bytes that do not need to be escaped; configuration files in non-Western languages in particular would contain many escapes. This commit fixes that by escaping only the bytes that are not part of a valid UTF-8 sequence. BUG: 403557 FIXED-IN: 5.56 Test Plan: ninja && ninja test Reviewers: dfaure, arichardson, apol, #frameworks, thiago Subscribers: rapiteanu, kde-frameworks-devel Tags: #frameworks Differential Revision: https://phabricator.kde.org/D19107
author: Jos van den Oever <jos@vandenoever.info> 2019-02-17 23:32:35 +0100
committer: Jos van den Oever <jos@vandenoever.info> 2019-02-20 17:19:14 +0100
commit: 2cdcd4f30666fd1095ab7cf31361e404db871075 (patch)
tree: 4875c4c3a78583342821814b53444443278448e4
parent: fbad48f87c1e62b1d56215eba76a7bb7e2d22f1e (diff)
download: kconfig-2cdcd4f30666fd1095ab7cf31361e404db871075.tar.gz
kconfig-2cdcd4f30666fd1095ab7cf31361e404db871075.tar.bz2
3 files changed, 167 insertions, 11 deletions
diff --git a/autotests/kconfigtest.cpp b/autotests/kconfigtest.cpp
index 64c6223d..2ad3b312 100644
--- a/autotests/kconfigtest.cpp
+++ b/autotests/kconfigtest.cpp
@@ -1736,10 +1736,11 @@ void KConfigTest::testQByteArrayUtf8()
     QFile readFile(file.fileName());
     QVERIFY(readFile.open(QFile::ReadOnly));
 #define VALUE "Utf8=\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff"
+    const QByteArray fileBytes = readFile.readAll();
 #ifndef Q_OS_WIN
-    QCOMPARE(readFile.readAll(), QByteArrayLiteral("[General]\n" VALUE "\n"));
+    QCOMPARE(fileBytes, QByteArrayLiteral("[General]\n" VALUE "\n"));
 #else
-    QCOMPARE(readFile.readAll(), QByteArrayLiteral("[General]\r\n" VALUE "\r\n"));
+    QCOMPARE(fileBytes, QByteArrayLiteral("[General]\r\n" VALUE "\r\n"));
 #endif
 #undef VALUE
 
@@ -1749,6 +1750,55 @@ void KConfigTest::testQByteArrayUtf8()
     QCOMPARE(bytes, general2.readEntry("Utf8", QByteArray()));
 }
 
+void KConfigTest::testQStringUtf8_data() // rows for testQStringUtf8(): value and expected file form, tab-separated
+{
+    QTest::addColumn<QByteArray>("data");
+    QTest::newRow("1") << QByteArray("Téléchargements\tTéléchargements"); // valid 2-byte characters stay unescaped
+    QTest::newRow("2") << QByteArray("$¢ह€𐍈\t$¢ह€𐍈"); // valid 1-, 2-, 3- and 4-byte characters stay unescaped
+    QTest::newRow("3") << QByteArray("\xc2\xe0\xa4\xf0\x90\x8d\t\\xc2\\xe0\\xa4\\xf0\\x90\\x8d"); // truncated 2-, 3- and 4-byte sequences: every byte is escaped
+    // overlong 2-byte encoding (lead byte 0xc1): expected fully escaped
+    QTest::newRow("4") << QByteArray("\xc1\xbf\t\\xc1\\xbf");
+    // overlong 3-byte encoding (0xe0 followed by a byte below 0xa0): expected fully escaped
+    QTest::newRow("5") << QByteArray("\xe0\x9f\xbf\t\\xe0\\x9f\\xbf");
+    // overlong 4-byte encoding (0xf0 followed by a byte below 0x90): expected fully escaped
+    QTest::newRow("6") << QByteArray("\xf0\x8f\xbf\xbf\t\\xf0\\x8f\\xbf\\xbf");
+    // first sequence beyond U+10FFFF (0xf4 followed by 0x90): expected fully escaped
+    QTest::newRow("7") << QByteArray("\xf4\x90\x80\x80\t\\xf4\\x90\\x80\\x80");
+    // boundary sequences that are still valid: expected to be written unescaped
+    QTest::newRow("8") << QByteArray("\xc2\x80\t\xc2\x80"); // smallest 2-byte sequence
+    QTest::newRow("9") << QByteArray("\xe0\xa0\x80\t\xe0\xa0\x80"); // smallest 3-byte sequence
+    QTest::newRow("10") << QByteArray("\xf0\x90\x80\x80\t\xf0\x90\x80\x80"); // smallest 4-byte sequence
+    QTest::newRow("11") << QByteArray("\xf4\x8f\xbf\xbf\t\xf4\x8f\xbf\xbf"); // largest 4-byte sequence (U+10FFFF)
+}
+
+void KConfigTest::testQStringUtf8() // writes each value, checks the raw file bytes, then reads the value back
+{
+    QFETCH(QByteArray, data);
+    const QList<QByteArray> d = data.split('\t'); // row format: value, tab, expected serialization
+    const QByteArray value = d[0]; // raw bytes stored in the config entry
+    const QByteArray serialized = d[1]; // bytes expected after "key=" in the written file
+    QTemporaryFile file;
+    QVERIFY(file.open());
+    KConfig config(file.fileName(), KConfig::SimpleConfig);
+    KConfigGroup general(&config, "General");
+    general.writeEntry("key", value);
+    config.sync(); // the file is read back below, so the entry has to be written out here
+    file.flush();
+    file.close();
+    QFile readFile(file.fileName());
+    QVERIFY(readFile.open(QFile::ReadOnly));
+    QByteArray fileBytes = readFile.readAll();
+#ifdef Q_OS_WIN
+    fileBytes.replace("\r\n", "\n"); // normalize line endings so a single expectation works on Windows too
+#endif
+    QCOMPARE(fileBytes, QByteArrayLiteral("[General]\nkey=") + serialized + QByteArrayLiteral("\n")); // exact file content
+
+    // check that reading the file back through a fresh KConfig yields the original bytes
+    KConfig config2(file.fileName(), KConfig::SimpleConfig);
+    KConfigGroup general2(&config2, "General");
+    QCOMPARE(value, general2.readEntry("key", QByteArray()));
+}
+
 void KConfigTest::testNewlines()
 {
     // test that kconfig always uses the native line endings
diff --git a/autotests/kconfigtest.h b/autotests/kconfigtest.h
index 0715f45e..26d8e7f5 100644
--- a/autotests/kconfigtest.h
+++ b/autotests/kconfigtest.h
@@ -64,6 +64,8 @@ private Q_SLOTS:
     void testReparent();
     void testAnonymousConfig();
     void testQByteArrayUtf8();
+    void testQStringUtf8_data();
+    void testQStringUtf8();
 
     void testSubGroup();
     void testAddConfigSources();
diff --git a/src/core/kconfigini.cpp b/src/core/kconfigini.cpp
index 84d77b48..87c4a8af 100644
--- a/src/core/kconfigini.cpp
+++ b/src/core/kconfigini.cpp
@@ -647,13 +647,110 @@ bool KConfigIniBackend::isLocked() const
     return lockFile && lockFile->isLocked();
 }
 
-QByteArray KConfigIniBackend::stringToPrintable(const QByteArray &aString, StringType type)
-{
-    static const char nibbleLookup[] = {
-        '0', '1', '2', '3', '4', '5', '6', '7',
-        '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+namespace {
+    // Write the 4-character escape sequence \xNN (lowercase hex) for byte s at data.
+    // data must have room for 4 more bytes; returns the position just past them.
+    char* escapeByte(char* data, unsigned char s) {
+        static const char nibbleLookup[] = {
+            '0', '1', '2', '3', '4', '5', '6', '7',
+            '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+        };
+        *data++ = '\\';
+        *data++ = 'x';
+        *data++ = nibbleLookup[s >> 4]; // high nibble
+        *data++ = nibbleLookup[s & 0x0f]; // low nibble
+        return data;
+    }
+
+    // Accumulator for the bytes of one multi-byte UTF-8 character (2 to 4 bytes).
+    // It buffers bytes that so far form a valid UTF-8 prefix, so that they can be
+    // written literally once complete, or escaped if the sequence turns out invalid.
+    struct Utf8Char {
+    public:
+        unsigned char bytes[4]; // bytes buffered so far
+        unsigned char count; // number of valid entries in bytes
+        unsigned char charLength; // expected total length, derived from the lead byte
+
+        Utf8Char() {
+            clear();
+            charLength = 0;
+        }
+        void clear() { // forget the buffered bytes; charLength is left unchanged
+            count = 0;
+        }
+        // Add a byte to the UTF-8 character; the first byte must be a multi-byte lead byte.
+        // Returns false, leaving the buffered bytes untouched, if b cannot extend the sequence.
+        bool addByte(unsigned char b) {
+            if (count == 0) {
+                if (b > 0xc1 && (b & 0xe0) == 0xc0) { // 0xc2..0xdf; 0xc0 and 0xc1 would be overlong
+                    charLength = 2;
+                } else if ((b & 0xf0) == 0xe0) { // 0xe0..0xef
+                    charLength = 3;
+                } else if (b < 0xf5 && (b & 0xf8) == 0xf0) { // 0xf0..0xf4
+                    charLength = 4;
+                } else { // ASCII, stray continuation byte or invalid lead byte
+                    return false;
+                }
+                bytes[0] = b;
+                count = 1;
+            } else if (count < 4 && (b & 0xc0) == 0x80) { // continuation byte 0x80..0xbf
+                if (count == 1) { // range checks only concern the first continuation byte
+                    if (charLength == 3 && bytes[0] == 0xe0 && b < 0xa0) {
+                        return false; // overlong 3 byte sequence
+                    } // NOTE(review): 0xed followed by 0xa0..0xbf (UTF-16 surrogates) is not rejected here
+                    if (charLength == 4) {
+                        if (bytes[0] == 0xf0 && b < 0x90) {
+                            return false; // overlong 4 byte sequence
+                        }
+                        if (bytes[0] == 0xf4 && b > 0x8f) {
+                            return false; // Unicode value larger than U+10FFFF
+                        }
+                    }
+                }
+                bytes[count++] = b;
+            } else { // not a continuation byte (or the buffer is already full)
+                return false;
+            }
+            return true;
+        }
+        // Return true once all charLength bytes of the character have been added.
+        bool isComplete() {
+            return count > 0 && count == charLength;
+        }
+        // Write every buffered byte to data as an escape sequence, reset the buffer, return the new end.
+        char* escapeBytes(char* data) {
+            for (unsigned char i = 0; i < count; ++i) {
+                data = escapeByte(data, bytes[i]);
+            }
+            clear();
+            return data;
+        }
+        // Copy the buffered bytes to data unescaped, reset the buffer, return the new end.
+        // Only call this if isComplete() returns true.
+        char* writeUtf8(char* data) {
+            for (unsigned char i = 0; i < count; ++i) {
+                *data++ = bytes[i];
+            }
+            clear();
+            return data;
+        }
+        // Write the buffered bytes literally if they form a complete character,
+        // otherwise write them in escaped form. Either way the buffer is reset.
+        // This is useful to handle the state that remains after handling
+        // all bytes in a buffer, e.g. a sequence truncated at the end of the input.
+        char* write(char* data) {
+            if (isComplete()) {
+                data = writeUtf8(data);
+            } else {
+                data = escapeBytes(data);
+            }
+            return data;
+        }
     };
+} // namespace
 
+QByteArray KConfigIniBackend::stringToPrintable(const QByteArray &aString, StringType type)
+{
     if (aString.isEmpty()) {
         return aString;
     }
@@ -672,10 +769,16 @@ QByteArray KConfigIniBackend::stringToPrintable(const QByteArray &aString, Strin
         *data++ = 's';
         i++;
     }
+    Utf8Char utf8;
 
     for (; i < l; ++i/*, r++*/) {
         switch (s[i]) {
         default:
+            if (utf8.addByte(s[i])) {
+                break;
+            } else {
+                data = utf8.escapeBytes(data);
+            }
             // The \n, \t, \r cases (all < 32) are handled below; we can ignore them here
             if (((unsigned char)s[i]) < 32) {
                 goto doEscape;
@@ -717,13 +820,14 @@ QByteArray KConfigIniBackend::stringToPrintable(const QByteArray &aString, Strin
                 break;
             }
         doEscape:
-            *data++ = '\\';
-            *data++ = 'x';
-            *data++ = nibbleLookup[((unsigned char)s[i]) >> 4];
-            *data++ = nibbleLookup[((unsigned char)s[i]) & 0x0f];
+            data = escapeByte(data, s[i]);
             break;
         }
+        if (utf8.isComplete()) {
+            data = utf8.writeUtf8(data);
+        }
     }
+    data = utf8.write(data);
     *data = 0;
     result.resize(data - start);
author	Jos van den Oever <jos@vandenoever.info>	2019-02-17 23:32:35 +0100
committer	Jos van den Oever <jos@vandenoever.info>	2019-02-20 17:19:14 +0100
commit	2cdcd4f30666fd1095ab7cf31361e404db871075 (patch)
tree	4875c4c3a78583342821814b53444443278448e4
parent	fbad48f87c1e62b1d56215eba76a7bb7e2d22f1e (diff)
download	kconfig-2cdcd4f30666fd1095ab7cf31361e404db871075.tar.gz kconfig-2cdcd4f30666fd1095ab7cf31361e404db871075.tar.bz2