LCOV - code coverage report
Current view: top level - src/univalue/include - univalue_utffilter.h (source / functions) Hit Total Coverage
Test: fuzz_coverage.info Lines: 71 71 100.0 %
Date: 2023-10-05 15:40:34 Functions: 5 5 100.0 %
Branches: 43 44 97.7 %

           Branch data     Line data    Source code
       1                 :            : // Copyright 2016 Wladimir J. van der Laan
       2                 :            : // Distributed under the MIT software license, see the accompanying
       3                 :            : // file COPYING or https://opensource.org/licenses/mit-license.php.
       4                 :            : #ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       5                 :            : #define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       6                 :            : 
       7                 :            : #include <string>
       8                 :            : 
       9                 :            : /**
      10                 :            :  * Filter that generates and validates UTF-8, as well as collates UTF-16
      11                 :            :  * surrogate pairs as specified in RFC4627.
      12                 :            :  */
      13                 :            : class JSONUTF8StringFilter
      14                 :            : {
      15                 :            : public:
      16                 :     573830 :     explicit JSONUTF8StringFilter(std::string& s)
      17                 :     573830 :         : str(s)
      18                 :            :     {
      19                 :     573830 :     }
      20                 :            :     // Write single 8-bit char (may be part of UTF-8 sequence)
      21                 :   23332112 :     void push_back(unsigned char ch)
      22                 :            :     {
      23         [ +  + ]:   23332112 :         if (state == 0) {
      24         [ +  + ]:   21613343 :             if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
      25                 :   20423239 :                 str.push_back(ch);
      26         [ +  + ]:    1190104 :             else if (ch < 0xc0) // Mid-sequence character, invalid in this state
      27                 :      47797 :                 is_valid = false;
      28         [ +  + ]:    1142307 :             else if (ch < 0xe0) { // Start of 2-byte sequence
      29                 :     714915 :                 codepoint = (ch & 0x1f) << 6;
      30                 :     714915 :                 state = 6;
      31         [ +  + ]:    1142307 :             } else if (ch < 0xf0) { // Start of 3-byte sequence
      32                 :      32121 :                 codepoint = (ch & 0x0f) << 12;
      33                 :      32121 :                 state = 12;
      34         [ +  + ]:     427392 :             } else if (ch < 0xf8) { // Start of 4-byte sequence
      35                 :     313234 :                 codepoint = (ch & 0x07) << 18;
      36                 :     313234 :                 state = 18;
      37                 :     313234 :             } else // Reserved, invalid
      38                 :      82037 :                 is_valid = false;
      39                 :   21613343 :         } else {
      40         [ +  + ]:    1718769 :             if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
      41                 :    1590575 :                 is_valid = false;
      42                 :    1718769 :             state -= 6;
      43                 :    1718769 :             codepoint |= (ch & 0x3f) << state;
      44         [ +  + ]:    1718769 :             if (state == 0)
      45                 :    1060212 :                 push_back_u(codepoint);
      46                 :            :         }
      47                 :   23332112 :     }
      48                 :            :     // Write codepoint directly, possibly collating surrogate pairs
      49                 :    1072633 :     void push_back_u(unsigned int codepoint_)
      50                 :            :     {
      51         [ +  + ]:    1072633 :         if (state) // Only accept full codepoints in open state
      52                 :       1698 :             is_valid = false;
      53   [ +  +  +  + ]:    1072633 :         if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
      54         [ +  + ]:       5786 :             if (surpair) // Two subsequent surrogate pair openers - fail
      55                 :       3878 :                 is_valid = false;
      56                 :            :             else
      57                 :       1908 :                 surpair = codepoint_;
      58   [ +  +  +  + ]:    1072633 :         } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
      59         [ +  + ]:       3447 :             if (surpair) { // Open surrogate pair, expect second half
      60                 :            :                 // Compute code point from UTF-16 surrogate pair
      61                 :       1815 :                 append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00));
      62                 :       1815 :                 surpair = 0;
      63                 :       1815 :             } else // Second half doesn't follow a first half - fail
      64                 :       1632 :                 is_valid = false;
      65                 :       3447 :         } else {
      66         [ +  + ]:    1063400 :             if (surpair) // First half of surrogate pair not followed by second - fail
      67                 :       2885 :                 is_valid = false;
      68                 :            :             else
      69                 :    1060515 :                 append_codepoint(codepoint_);
      70                 :            :         }
      71                 :    1072633 :     }
      72                 :            :     // Check that we're in a state where the string can be ended
      73                 :            :     // No open sequences, no open surrogate pairs, etc
      74                 :     573478 :     bool finalize()
      75                 :            :     {
      76   [ +  +  +  + ]:     573478 :         if (state || surpair)
      77                 :         34 :             is_valid = false;
      78                 :     573478 :         return is_valid;
      79                 :            :     }
      80                 :            : private:
      81                 :            :     std::string &str;
      82                 :     573830 :     bool is_valid{true};
      83                 :            :     // Current UTF-8 decoding state
      84                 :     573830 :     unsigned int codepoint{0};
      85                 :     573830 :     int state{0}; // Top bit to be filled in for next UTF-8 byte, or 0
      86                 :            : 
      87                 :            :     // Keep track of the following state to handle the following section of
      88                 :            :     // RFC4627:
      89                 :            :     //
      90                 :            :     //    To escape an extended character that is not in the Basic Multilingual
      91                 :            :     //    Plane, the character is represented as a twelve-character sequence,
      92                 :            :     //    encoding the UTF-16 surrogate pair.  So, for example, a string
      93                 :            :     //    containing only the G clef character (U+1D11E) may be represented as
      94                 :            :     //    "\uD834\uDD1E".
      95                 :            :     //
      96                 :            :     //  Two subsequent \u.... may have to be replaced with one actual codepoint.
      97                 :     573830 :     unsigned int surpair{0}; // First half of open UTF-16 surrogate pair, or 0
      98                 :            : 
      99                 :    1062330 :     void append_codepoint(unsigned int codepoint_)
     100                 :            :     {
     101         [ +  + ]:    1062330 :         if (codepoint_ <= 0x7f)
     102                 :      11793 :             str.push_back((char)codepoint_);
     103         [ +  + ]:    1050537 :         else if (codepoint_ <= 0x7FF) {
     104                 :     701825 :             str.push_back((char)(0xC0 | (codepoint_ >> 6)));
     105                 :     701825 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     106         [ +  + ]:    1050537 :         } else if (codepoint_ <= 0xFFFF) {
     107                 :      33958 :             str.push_back((char)(0xE0 | (codepoint_ >> 12)));
     108                 :      33958 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     109                 :      33958 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     110         [ -  + ]:     348712 :         } else if (codepoint_ <= 0x1FFFFF) {
     111                 :     314754 :             str.push_back((char)(0xF0 | (codepoint_ >> 18)));
     112                 :     314754 :             str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F)));
     113                 :     314754 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     114                 :     314754 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     115                 :     314754 :         }
     116                 :    1062330 :     }
     117                 :            : };
     118                 :            : 
     119                 :            : #endif // BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H

Generated by: LCOV version 1.14