LCOV - code coverage report
Current view: top level - lib - stream.cpp (source / functions) Hit Total Coverage
Test: coverage.info Lines: 232 232 100.0 %
Date: 2014-11-22 Functions: 34 34 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* stream.cpp -- written by Alexis WILKE for Made to Order Software Corp. (c) 2005-2014 */
       2             : 
       3             : /*
       4             : 
       5             : Copyright (c) 2005-2014 Made to Order Software Corp.
       6             : 
       7             : http://snapwebsites.org/project/as2js
       8             : 
       9             : Permission is hereby granted, free of charge, to any
      10             : person obtaining a copy of this software and
      11             : associated documentation files (the "Software"), to
      12             : deal in the Software without restriction, including
      13             : without limitation the rights to use, copy, modify,
      14             : merge, publish, distribute, sublicense, and/or sell
      15             : copies of the Software, and to permit persons to whom
      16             : the Software is furnished to do so, subject to the
      17             : following conditions:
      18             : 
      19             : The above copyright notice and this permission notice
      20             : shall be included in all copies or substantial
      21             : portions of the Software.
      22             : 
      23             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
      24             : ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
      25             : LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
      26             : FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
      27             : EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
      28             : LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
      29             : WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
      30             : ARISING FROM, OUT OF OR IN CONNECTION WITH THE
      31             : SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
      32             : SOFTWARE.
      33             : 
      34             : */
      35             : 
      36             : #include    "as2js/stream.h"
      37             : 
      38             : #include    "as2js/exceptions.h"
      39             : #include    "as2js/message.h"
      40             : 
      41             : #include    <unistd.h>
      42             : 
      43             : 
      44             : namespace as2js
      45             : {
      46             : 
      47             : /**********************************************************************/
      48             : /**********************************************************************/
      49             : /***  FILTERS  ********************************************************/
      50             : /**********************************************************************/
      51             : /**********************************************************************/
      52             : 
      53             : 
      54             : /** \brief Push one byte in the decoder.
      55             :  *
      56             :  * This function pushes exactly one byte in the decoder.
      57             :  *
      58             :  * In most cases decoders expect their getc() function to be called
      59             :  * right after each putc(), although it is not mandatory.
      60             :  *
      61             :  * \param[in] c  The byte to append to this decoding filter.
      62             :  */
      63    75437912 : void DecodingFilter::putc(unsigned char c)
      64             : {
      65    75437912 :     f_buffer.push_back(c); // appended at the end; the get_char() decoders consume from the front
      66    75437912 : }
      67             : 
      68             : 
      69             : /** \brief Retrieve the next character.
      70             :  *
      71             :  * This function retrieves the next input character.
      72             :  *
      73             :  * If there is data, but not enough of it, it returns Input::INPUT_NAC.
      74             :  * Processing can safely continue.
      75             :  *
      76             :  * If possible, the function avoids returning with the Input::INPUT_NAC
      77             :  * result (i.e. if a filter returns that value when there is still data
      78             :  * available in the buffer, its get_char() function gets called again.)
      79             :  *
      80             :  * If there is data, but it cannot properly be converted to a valid
      81             :  * character, it returns Input::INPUT_ERR.
      82             :  *
      83             :  * If there is no data, then Input::INPUT_EOF is returned.
      84             :  *
      85             :  * \return The next character available or one of the Input::INPUT_...
      86             :  *         results (EOF, NAC, ERR).
      87             :  */
      88   104329205 : Input::char_t DecodingFilter::getc()
      89             : {
      90   104329205 :     if(f_buffer.empty())
      91             :     {
      92    22813468 :         return Input::INPUT_EOF; // nothing buffered: report EOF without calling the decoder
      93             :     }
      94             : 
      95    81515737 :     return get_char(); // at least one byte is buffered: let the encoding specific decoder handle it
      96             : }
      97             : 
      98             : 
      99             : /** \brief Get the next ISO-8859-1 character.
     100             :  *
     101             :  * This function returns the next unsigned char from the input buffer.
     102             :  *
     103             :  * \note
     104             :  * We know that the buffer has at least one byte because get_char() is
     105             :  * called only after getc() checked whether the input buffer was
     106             :  * empty.
     107             :  *
     108             :  * \return The next character.
     109             :  */
     110         774 : Input::char_t DecodingFilterISO88591::get_char()
     111             : {
     112             :     // no conversion is applied: the byte value is returned as the character
     113         774 :     Input::char_t c(f_buffer[0]);
     114         774 :     f_buffer.erase(f_buffer.begin()); // consume that one byte
     115         774 :     return c;
     116             : }
     117             : 
     118             : 
     119             : /** \brief Get the next UTF-8 character.
     120             :  *
     121             :  * This function reads the next UTF-8 character. Since UTF-8 makes use of
     122             :  * a variable number of bytes, the function may return Input::INPUT_NAC
     123             :  * meaning that not enough data is available in the input buffer.
     124             :  *
     125             :  * If an invalid UTF-8 sequence is discovered, then Input::INPUT_ERR is
     126             :  * returned. The function can still be called with additional data to
     127             :  * read whatever comes next. Multiple errors may be returned while skipping
     128             :  * encoded bytes.
     129             :  *
     130             :  * \return The next character, Input::INPUT_NAC, or Input::INPUT_ERR.
     131             :  */
     132    37285445 : Input::char_t DecodingFilterUTF8::get_char()
     133             : {
     134             :     // Note: we know that the buffer is at least 1 byte
     135    37285445 :     unsigned char b(f_buffer[0]);
     136             : 
     137    37285445 :     if(b < 0x80)
     138             :     {
     139    16511720 :         f_buffer.erase(f_buffer.begin()); // single byte character, returned as is
     140    16511720 :         return b;
     141             :     }
     142             : 
     143    20773725 :     size_t l(0); // total length of the sequence in bytes, lead byte included
     144    20773725 :     zas_char_t c; // accumulates the payload bits of the character
     145    20773725 :     if(b >= 0xC0 && b <= 0xDF) // NOTE(review): 0xC0 and 0xC1 (overlong forms) are accepted as lead bytes here
     146             :     {
     147        7727 :         l = 2;
     148        7727 :         c = b & 0x1F;
     149             :     }
     150    20765998 :     else if(b >= 0xE0 && b <= 0xEF)
     151             :     {
     152      444646 :         l = 3;
     153      444646 :         c = b & 0x0F;
     154             :     }
     155    20321352 :     else if(b >= 0xF0 && b <= 0xF7)
     156             :     {
     157    20321340 :         l = 4;
     158    20321340 :         c = b & 0x07;
     159             :     }
     160             :     else
     161             :     {
     162             :         // not a valid lead byte (0x80 to 0xBF or 0xF8 to 0xFF), erase one input byte
     163          12 :         f_buffer.erase(f_buffer.begin());
     164          12 :         return Input::INPUT_ERR;
     165             :     }
     166    20773713 :     if(f_buffer.size() < l)
     167             :     {
     168             :         // not enough bytes for this character; the buffer is left untouched
     169    18675619 :         return Input::INPUT_NAC;
     170             :     }
     171     8324782 :     for(size_t i(1); i < l; ++i)
     172             :     {
     173     6226758 :         b = f_buffer[i];
     174     6226758 :         if(b < 0x80 || b > 0xBF)
     175             :         {
     176             :             // invalid continuation byte: drop the i bytes before it, keep it for the next call
     177          70 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + i);
     178          70 :             return Input::INPUT_ERR;
     179             :         }
     180     6226688 :         c = (c << 6) | (b & 0x3F);
     181             :     }
     182             : 
     183             :     // consume the whole sequence (also done when the character is rejected below)
     184     2098024 :     f_buffer.erase(f_buffer.begin(), f_buffer.begin() + l);
     185             : 
     186             :     // is it a UTF-16 surrogate or too large a character?
     187     2098024 :     if(!String::valid_character(c))
     188             :     {
     189      985087 :         return Input::INPUT_ERR;
     190             :     }
     191             : 
     192             :     // return that character
     193     1112937 :     return c;
     194             : }
     195             : 
     196             : 
     197             : /** \brief Decode a UTF-16 character.
     198             :  *
     199             :  * This function is called with a 2 byte value which either represents
     200             :  * a Unicode character as is, or a UTF-16 surrogate. When a surrogate
     201             :  * is detected, it is transformed into a full Unicode character by this
     202             :  * function. The function needs to be called twice to decode one full
     203             :  * Unicode character described using a surrogate.
     204             :  *
     205             :  * If an invalid surrogate sequence is found, then the function
     206             :  * returns Input::INPUT_ERR.
     207             :  *
     208             :  * When the lead surrogate is found, the function returns Input::INPUT_NAC
     209             :  * meaning that more data is necessary and the function needs to be called
     210             :  * again to proceed.
     211             :  *
     212             :  * \param[in] c  Two byte value representing a Unicode character or a UTF-16
     213             :  *               surrogate.
     214             :  *
     215             :  * \return The following character, Input::INPUT_NAC, or Input::INPUT_ERR.
     216             :  */
     217     8650752 : Input::char_t DecodingFilterUTF16::next_char(Input::char_t c)
     218             : {
     219     8650752 :     if(c >= 0xD800 && c < 0xDC00)
     220             :     {
     221     4196352 :         f_lead_surrogate = c; // remembered until the next call (replaces any lead still pending)
     222     4196352 :         return Input::INPUT_NAC; // not an error unless it was the last 2 bytes
     223             :     }
     224     4454400 :     else if(c >= 0xDC00 && c <= 0xDFFF)
     225             :     {
     226     4196352 :         if(f_lead_surrogate == 0)
     227             :         {
     228             :             // lead surrogate missing, skip trail (erased here; the get_char() callers return on ERR without erasing)
     229        2048 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
     230        2048 :             return Input::INPUT_ERR;
     231             :         }
     232     4194304 :         c = (((static_cast<as_char_t>(f_lead_surrogate) & 0x03FF) << 10) | (static_cast<as_char_t>(c) & 0x03FF)) + 0x10000;
     233     4194304 :         f_lead_surrogate = 0;
     234             :     }
     235      258048 :     else if(f_lead_surrogate != 0)
     236             :     {
     237             :         // trail surrogate missing; the 2 bytes of c are not erased so they get decoded again on the next call
     238        2048 :         f_lead_surrogate = 0;
     239        2048 :         return Input::INPUT_ERR;
     240             :     }
     241             : 
     242     4450304 :     return c;
     243             : }
     244             : 
     245             : 
     246             : /** \brief Decode UTF-16 in Little Endian format.
     247             :  *
     248             :  * This function reads data in UTF-16 Little Endian. The function may
     249             :  * return Input::INPUT_NAC if called without enough data forming a
     250             :  * Unicode character or when only the lead surrogate is read.
     251             :  *
     252             :  * The function returns Input::INPUT_ERR if the function finds a lead
     253             :  * without a trail surrogate, or a trail without a lead.
     254             :  *
     255             :  * \return The next character, Input::INPUT_ERR, or Input::INPUT_NAC.
     256             :  */
     257     7538689 : Input::char_t DecodingFilterUTF16LE::get_char()
     258             : {
     259             :     Input::char_t c;
     260     4323328 :     do
     261             :     {
     262     7538689 :         if(f_buffer.size() < 2)
     263             :         {
     264     3213313 :             return Input::INPUT_NAC;
     265             :         }
     266             : 
     267     4325376 :         c = next_char(f_buffer[0] + f_buffer[1] * 256); // little endian: byte 0 is the least significant
     268     4325376 :         if(c == Input::INPUT_ERR)
     269             :         {
     270        2048 :             return Input::INPUT_ERR; // nothing erased here, next_char() erased what had to be skipped
     271             :         }
     272     4323328 :         f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
     273             :     }
     274             :     while(c == Input::INPUT_NAC); // a lead surrogate was read: try to read its trail right away
     275             : 
     276     2225152 :     return c;
     277             : }
     278             : 
     279             : 
     280             : /** \brief Decode UTF-16 in Big Endian format.
     281             :  *
     282             :  * This function reads data in UTF-16 Big Endian. The function may
     283             :  * return Input::INPUT_NAC if called without enough data forming a
     284             :  * Unicode character or when only the lead surrogate is read.
     285             :  *
     286             :  * The function returns Input::INPUT_ERR if the function finds a lead
     287             :  * without a trail surrogate, or a trail without a lead.
     288             :  *
     289             :  * \return The next character, Input::INPUT_ERR, or Input::INPUT_NAC.
     290             :  */
     291     7538689 : Input::char_t DecodingFilterUTF16BE::get_char()
     292             : {
     293             :     Input::char_t c;
     294     4323328 :     do
     295             :     {
     296     7538689 :         if(f_buffer.size() < 2)
     297             :         {
     298     3213313 :             return Input::INPUT_NAC;
     299             :         }
     300             : 
     301     4325376 :         c = next_char(f_buffer[0] * 256 + f_buffer[1]); // big endian: byte 0 is the most significant
     302     4325376 :         if(c == Input::INPUT_ERR)
     303             :         {
     304        2048 :             return Input::INPUT_ERR; // nothing erased here, next_char() erased what had to be skipped
     305             :         }
     306     4323328 :         f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
     307             :     }
     308             :     while(c == Input::INPUT_NAC); // a lead surrogate was read: try to read its trail right away
     309             : 
     310     2225152 :     return c;
     311             : }
     312             : 
     313             : 
     314             : /** \brief Decode UTF-32 in Little Endian format.
     315             :  *
     316             :  * This function reads data in UTF-32 Little Endian. The function may
     317             :  * return Input::INPUT_ERR if the input represents an invalid character
     318             :  * (i.e. a character larger than 0x10FFFF or representing a UTF-16
     319             :  * surrogate encoding.)
     320             :  *
     321             :  * If the buffer does not at least include 4 bytes, then the function
     322             :  * returns Input::INPUT_NAC.
     323             :  *
     324             :  * \return The next character, Input::INPUT_NAC, or Input::INPUT_ERR.
     325             :  */
     326     8388867 : Input::char_t DecodingFilterUTF32LE::get_char()
     327             : {
     328     8388867 :     if(f_buffer.size() < 4)
     329             :     {
     330     6291459 :         return Input::INPUT_NAC;
     331             :     }
     332             : 
     333             :     // little endian has byte 0 as the least significant
     334             :     Input::char_t c(
     335     2097408 :               (f_buffer[0] <<  0)
     336     2097408 :             | (f_buffer[1] <<  8)
     337     2097408 :             | (f_buffer[2] << 16)
     338     2097408 :             | (f_buffer[3] << 24) // NOTE(review): presumably promoted to int, so a byte >= 0x80 reaches the sign bit -- confirm
     339     2097408 :         );
     340     2097408 :     f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4); // consumed even when the value is rejected below
     341     2097408 :     if(!String::valid_character(c))
     342             :     {
     343      985202 :         return Input::INPUT_ERR;
     344             :     }
     345     1112206 :     return c;
     346             : }
     347             : 
     348             : 
     349             : /** \brief Decode UTF-32 in Big Endian format.
     350             :  *
     351             :  * This function reads data in UTF-32 Big Endian. The function may
     352             :  * return Input::INPUT_ERR if the input represents an invalid character
     353             :  * (i.e. a character larger than 0x10FFFF or representing a UTF-16
     354             :  * surrogate encoding.)
     355             :  *
     356             :  * If the buffer does not at least include 4 bytes, then the function
     357             :  * returns Input::INPUT_NAC.
     358             :  *
     359             :  * \return The next character, Input::INPUT_NAC, or Input::INPUT_ERR.
     360             :  */
     361     8388867 : Input::char_t DecodingFilterUTF32BE::get_char()
     362             : {
     363     8388867 :     if(f_buffer.size() < 4)
     364             :     {
     365     6291459 :         return Input::INPUT_NAC;
     366             :     }
     367             : 
     368             :     // big endian has byte 0 as the most significant
     369             :     Input::char_t c(
     370     2097408 :               (f_buffer[0] << 24) // NOTE(review): presumably promoted to int, so a byte >= 0x80 reaches the sign bit -- confirm
     371     2097408 :             | (f_buffer[1] << 16)
     372     2097408 :             | (f_buffer[2] <<  8)
     373     2097408 :             | (f_buffer[3] <<  0)
     374     2097408 :         );
     375     2097408 :     f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4); // consumed even when the value is rejected below
     376     2097408 :     if(!String::valid_character(c))
     377             :     {
     378      985202 :         return Input::INPUT_ERR;
     379             :     }
     380     1112206 :     return c;
     381             : }
     382             : 
     383             : 
     384             : /** \brief Read the next character in any format.
     385             :  *
     386             :  * This function reads one character from the input stream. At first
     387             :  * the stream is considered to be undefined (no specific filter defined).
     388             :  *
     389             :  * Once we have at least 4 bytes of data, we try to detect a BOM. If no
     390             :  * BOM is detected, we check whether the buffered bytes are valid UTF-8;
     391             :  * if so, we use the UTF-8 filter, otherwise we fall back on the
     392             :  * ISO-8859-1 filter. (UTF-16 and UTF-32 are only selected through their
     393             :  * BOM; no BOM-less detection of these encodings is done here.)
     394             :  *
     395             :  * \bug
     396             :  * Known bug: if the input file is less than 4 bytes it cannot be
     397             :  * used because this filter will always return a NAC. So even a valid
     398             :  * source of 1, 2, or 3 characters fails. However, the likelihood of
     399             :  * such a script being useful is so low that we do not care too
     400             :  * much.
     401             :  *
     402             :  * \return The following character, Input::INPUT_NAC, or Input::INPUT_ERR.
     403             :  */
     404    16570758 : Input::char_t DecodingFilterDetect::get_char()
     405             : {
     406    16570758 :     if(!f_filter)
     407             :     {
     408       75075 :         if(f_buffer.size() < 4)
     409             :         {
     410       56301 :             return Input::INPUT_NAC;
     411             :         }
     412             : 
     413             :         // read the BOM in big endian
     414             :         uint32_t bom(
     415       18774 :                   (f_buffer[0] << 24)
     416       18774 :                 | (f_buffer[1] << 16)
     417       18774 :                 | (f_buffer[2] <<  8)
     418       18774 :                 | (f_buffer[3] <<  0)
     419       18774 :             );
     420             : 
     421       18774 :         if(bom == 0x0000FEFF)
     422             :         {
     423             :             // UTF-32 Big Endian
     424           2 :             f_filter.reset(new DecodingFilterUTF32BE);
     425           2 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
     426             :         }
     427       18772 :         else if(bom == 0xFFFE0000) // must be tested before UTF-16 Little Endian which shares the FF FE prefix
     428             :         {
     429             :             // UTF-32 Little Endian
     430           2 :             f_filter.reset(new DecodingFilterUTF32LE);
     431           2 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 4);
     432             :         }
     433       18770 :         else if((bom >> 16) == 0xFEFF)
     434             :         {
     435             :             // UTF-16 Big Endian
     436           2 :             f_filter.reset(new DecodingFilterUTF16BE);
     437           2 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
     438             :         }
     439       18768 :         else if((bom >> 16) == 0xFFFE)
     440             :         {
     441             :             // UTF-16 Little Endian
     442           2 :             f_filter.reset(new DecodingFilterUTF16LE);
     443           2 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 2);
     444             :         }
     445       18766 :         else if((bom & 0xFFFFFF00) == 0xEFBBBF00)
     446             :         {
     447             :             // UTF-8 (the BOM is 3 bytes, the 4th byte read is data and stays in the buffer)
     448          13 :             f_filter.reset(new DecodingFilterUTF8);
     449          13 :             f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 3);
     450             :         }
     451             :         else
     452             :         {
     453             :             // no BOM: if the buffered bytes are valid UTF-8, then use UTF-8 (nothing is erased in this case)
     454       18753 :             String s;
     455       18753 :             String::conversion_result_t r(s.from_utf8(reinterpret_cast<char const *>(&f_buffer[0]), f_buffer.size()));
     456       18753 :             if(r == String::conversion_result_t::STRING_GOOD || r == String::conversion_result_t::STRING_END) // STRING_END presumably means the data stops inside a sequence -- confirm
     457             :             {
     458       18751 :                 f_filter.reset(new DecodingFilterUTF8);
     459             :             }
     460             :             else
     461             :             {
     462             :                 // fallback to ISO-8859-1 (should very rarely happen!)
     463           2 :                 f_filter.reset(new DecodingFilterISO88591);
     464       18753 :             }
     465             :         }
     466             :     }
     467             : 
     468             :     // we do not get BOMs returned, yet we could check for the BOM
     469             :     // character and adjust the filter if we detect it being
     470             :     // swapped (it does not look like Unicode promotes that scheme
     471             :     // anymore though, therefore at this point we won't do that...)
     472             : 
     473    16514457 :     Input::char_t c(f_filter->getc());
     474    66060057 :     while((c == Input::INPUT_EOF || c == Input::INPUT_NAC)
     475    49545602 :        && !f_buffer.empty())
     476             :     {
     477             :         // transmit the data added to "this" filter
     478             :         // down to f_filter, one byte at a time and only
     479             :         // as required because otherwise we'd generate an EOF
     480    16514477 :         f_filter->putc(f_buffer[0]);
     481    16514477 :         f_buffer.erase(f_buffer.begin(), f_buffer.begin() + 1);
     482    16514477 :         c = f_filter->getc();
     483             :     }
     484             : 
     485    16514457 :     return c;
     486             : }
     487             : 
     488             : 
     489             : 
     490             : /**********************************************************************/
     491             : /**********************************************************************/
     492             : /***  INPUT  **********************************************************/
     493             : /**********************************************************************/
     494             : /**********************************************************************/
     495             : 
     496             : 
     497             : /** \brief Initialize an input object.
     498             :  *
     499             :  * This function initializes the input object making it ready to be
     500             :  * used to read data from a file, a string, or a TTY.
     501             :  *
     502             :  * The \p filter should generally not be specified, although if you
     503             :  * know the format of an input file, it can be useful to force the
     504             :  * filter to the exact format. The decoders defined in this file
     505             :  * handle ISO-8859-1, UTF-8, UTF-16, and UTF-32.
     506             :  *
     507             :  * \param[in] filter  The filter to use while reading the input data.
     508             :  */
     509     5995372 : Input::Input(DecodingFilter::pointer_t filter)
     510     5995372 :     : f_filter(filter)
     511             :     //, f_position() -- auto-init
     512             :     //, f_unget() -- auto-init
     513             : {
     514     5995372 : }
     515             : 
     516             : 
     517             : /** \brief Get the position object of the input object.
     518             :  *
     519             :  * The stream manages a position object. The caller can use this function
     520             :  * to retrieve a read/write version of the current position.
     521             :  *
     522             :  * \return A modifiable reference to the position object.
     523             :  */
     524    90023280 : Position& Input::get_position()
     525             : {
     526    90023280 :     return f_position;
     527             : }
     528             : 
     529             : 
     530             : /** \brief Get the position object of the input object.
     531             :  *
     532             :  * The stream manages a position object. The caller can use this function
     533             :  * to retrieve a read-only version of the current position.
     534             :  *
     535             :  * \return A constant reference to the position object.
     536             :  */
     537          51 : Position const& Input::get_position() const
     538             : {
     539          51 :     return f_position;
     540             : }
     541             : 
     542             : 
     543             : /** \brief Get one character.
     544             :  *
     545             :  * This function retrieves the next character from the input object.
     546             :  *
     547             :  * If the caller used the ungetc() function, then the characters that
     548             :  * were ungotten are returned first, last one first (LIFO).
     549             :  *
     550             :  * \return The next character available in the stream.
     551             :  */
     552   281481729 : Input::char_t Input::getc()
     553             : {
     554   281481729 :     if(!f_unget.empty())
     555             :     {
     556          28 :         char_t result(f_unget.back()); // the most recently ungotten character comes out first
     557          28 :         f_unget.pop_back();
     558          28 :         return result;
     559             :     }
     560   281481701 :     return filter_getc(); // nothing was ungotten, read from the actual input
     561             : }
     562             : 
     563             : 
     564             : /** \brief Unget one character.
     565             :  *
     566             :  * This function saves the specified character \p c in a buffer of the
     567             :  * Input object. The next getc() call first returns the last character
     568             :  * that was ungotten. Values outside 1 .. 0x10FFFF are silently ignored.
     569             :  *
     570             :  * \param[in] c  The character to unget.
     571             :  */
     572          84 : void Input::ungetc(char_t c)
     573             : {
     574             :     // silently avoid ungetting special values such as INPUT_EOF
     575             :     // (TBD: maybe we should check surrogates?)
     576          84 :     if(c > 0 && c < 0x110000) // note that c == 0 (NUL) is dropped too
     577             :     {
     578          28 :         f_unget.push_back(c);
     579             :     }
     580          84 : }
     581             : 
     582             : 
     583             : /** \brief Get the next character.
     584             :  *
     585             :  * This function reads the next character from the input. In most cases
     586             :  * this reads one or more bytes from the input file, and then it
     587             :  * converts those bytes in a character using a filter.
     588             :  *
     589             :  * This function does not return Input::INPUT_NAC. Instead it reads as
     590             :  * much data as it can and returns the next character, no matter what.
     591             :  * However, it may return EOF if the end of the file is reached, or
     592             :  * ERR if a character in the stream is not valid. There are two types
     593             :  * of invalid characters: (1) numbers that are outside of the Unicode
     594             :  * range (0 .. 0x010FFFF) or a UTF-16 surrogate in a format that does
     595             :  * not support such surrogate (UTF-8, UTF-32), and (2) byte sequences
     596             :  * that end before a valid character can be formed (missing surrogate,
     597             :  * invalid UTF-8).
     598             :  *
     599             :  * \return The next character, Input::INPUT_EOF, or Input::INPUT_ERR.
     600             :  */
     601    16580070 : Input::char_t Input::filter_getc()
     602             : {
     603             :     // if the input class used does not overload this function,
     604             :     // then we get the next byte and try to convert it to a
     605             :     // character, if that works, return that character
     606             : 
     607             :     char_t w;
     608    16523866 :     do
     609             :     {
     610    16580070 :         char_t c(get_byte());
     611    16580069 :         if(c == Input::INPUT_EOF)
     612             :         {
     613             :             // determine the final result
     614       56203 :             w = f_filter->getc();
     615       56203 :             return w == Input::INPUT_NAC ? Input::INPUT_ERR : w;
     616             :         }
     617    16523866 :         f_filter->putc(c);
     618    16523866 :         w = f_filter->getc();
     619             :     }
     620    16465409 :     while(w == Input::INPUT_NAC || w == Input::INPUT_EOF);
     621             :     // EOF can happen if we bump into a BOM in the middle of nowhere
     622             :     // so we have to loop on EOF as well
     623             : 
     624    16465409 :     return w;
     625             : }
     626             : 
     627             : 
     628             : /** \brief Function used to get the following byte of data.
     629             :  *
     630             :  * This virtual function is used by the filter_getc() function to
     631             :  * retrieve the next byte of data from the input stream. The
     632             :  * default implementation of the function throws because it
     633             :  * should never get called.
     634             :  *
     635             :  * Note that it is possible to bypass this function by implementing
     636             :  * instead the filter_getc() in your own class.
     637             :  *
     638             :  * \exception exception_internal_error
     639             :  * This function always raises this exception because it should
     640             :  * not be called.
     641             :  *
     642             :  * \return The next byte from the input stream.
     643             :  */
     644           1 : Input::char_t Input::get_byte()
     645             : {
     646             :     // this function should never be called
     647           1 :     throw exception_internal_error("internal error: the get_byte() of the Input class was called");
     648             : }
     649             : 
     650             : 
     651             : /**********************************************************************/
     652             : /**********************************************************************/
     653             : /***  STANDARD INPUT  *************************************************/
     654             : /**********************************************************************/
     655             : /**********************************************************************/
     656             : 
     657             : 
     658             : /** \brief Use standard input as the input stream.
     659             :  *
     660             :  * This function sets up the input file to the standard input of the
     661             :  * process. In that case the filename is set to "-". However, there
     662             :  * is no size available.
     663             :  *
     664             :  * The function first calls close() to make sure that any previous
     665             :  * call to standard_input() or open() get cleaned up.
     666             :  *
     667             :  * \return true if the file could be opened.
     668             :  */
     669           2 : StandardInput::StandardInput()
     670             : {
     671           1 :     get_position().set_filename("-");
     672           1 : }
     673             : 
     674             : 
     675             : /** \brief Read one byte from the standard input.
     676             :  *
     677             :  * This function returns the next byte found in the standard input
     678             :  * stream.
     679             :  *
     680             :  * If the input stream can end and the end was reached, then
     681             :  * INPUT_EOF is returned.
     682             :  *
     683             :  * \return The next byte from the input stream.
     684             :  */
     685          39 : Input::char_t StandardInput::get_byte()
     686             : {
     687             :     char c;
     688          39 :     if(std::cin.get(c))
     689             :     {
     690          36 :         return static_cast<char_t>(c) & 255;
     691             :     }
     692           3 :     return INPUT_EOF;
     693             : }
     694             : 
     695             : 
     696             : /**********************************************************************/
     697             : /**********************************************************************/
     698             : /***  FILE INPUT  *****************************************************/
     699             : /**********************************************************************/
     700             : /**********************************************************************/
     701             : 
     702             : 
     703             : /** \brief Use the named file as the input stream.
     704             :  *
     705             :  * This function sets up the named file as the input stream of this
     706             :  * FileInput object.
     707             :  *
     708             :  * A FileInput object cannot be reused while a file is open: in that
     709             :  * case the function throws exception_file_already_open.
     710             :  *
     711             :  * \note
     712             :  * This function is not placed in the constructor because we want
     713             :  * to return false if the file cannot be opened.
     714             :  *
     715             :  * \param[in] filename  The name of the file to open.
     716             :  *
     717             :  * \return true if the file could be opened.
     718             :  */
     719       18789 : bool FileInput::open(String const& filename)
     720             : {
     721       18789 :     if(f_file.is_open())
     722             :     {
     723           1 :         throw exception_file_already_open("file object for \"" + get_position().get_filename().to_utf8() + "\" cannot be reused for \"" + filename.to_utf8() + "\"");
     724             :     }
     725             : 
     726       18788 :     std::string utf8(filename.to_utf8());
     727       18788 :     f_file.open(utf8.c_str());
     728       18788 :     if(!f_file.is_open())
     729             :     {
     730          28 :         return false;
     731             :     }
     732       18760 :     get_position().set_filename(filename);
     733             : 
     734       18760 :     return true;
     735             : }
     736             : 
     737             : 
     738             : /** \brief Get the next byte from the file.
     739             :  *
     740             :  * This function reads one byte from the input file and returns it.
     741             :  *
     742             :  * \return The read byte.
     743             :  */
     744    16580030 : Input::char_t FileInput::get_byte()
     745             : {
     746             :     char c;
     747    16580030 :     if(f_file.get(c))
     748             :     {
     749    16523830 :         return static_cast<char_t>(c) & 255;
     750             :     }
     751       56200 :     return INPUT_EOF;
     752             : }
     753             : 
     754             : 
     755             : 
     756             : 
     757             : /**********************************************************************/
     758             : /**********************************************************************/
     759             : /***  STRING INPUT  ***************************************************/
     760             : /**********************************************************************/
     761             : /**********************************************************************/
     762             : 
     763             : /** \brief Initialize the string input.
     764             :  *
     765             :  * This function initializes a StringInput object with the specified
     766             :  * string and line number. By default, line is set to 1 since strings
     767             :  * represent code from the start of a file.
     768             :  *
     769             :  * \param[in] str  The string parameter which we will read characters from.
     770             :  * \param[in] line  The start line for the Position object.
     771             :  */
     772     5976601 : StringInput::StringInput(String const& str, Position::counter_t line)
     773    11953202 :     : f_str(str)
     774             :     //, f_pos(0) -- auto-init
     775             : {
     776             :     // in case line is not set to 1
     777     5976601 :     get_position().reset_counters(line);
     778     5976601 : }
     779             : 
     780             : 
     781             : /** \brief Get the next character.
     782             :  *
     783             :  * This function bypasses the Input filter since we already have
     784             :  * UTF-32 characters in the input string.
     785             :  *
     786             :  * \return The next character from the string or Input::INPUT_EOF.
     787             :  */
     788   264960088 : Input::char_t StringInput::filter_getc() // bypass the filters
     789             : {
     790   264960088 :     char_t  c(INPUT_EOF);
     791             : 
     792   264960088 :     if(f_pos < f_str.length())
     793             :     {
     794   260265538 :         c = f_str[f_pos];
     795   260265538 :         ++f_pos;
     796             :     }
     797   264960088 :     return c;
     798             : }
     799             : 
     800             : 
     801             : 
     802             : 
     803             : /**********************************************************************/
     804             : /**********************************************************************/
     805             : /***  OUTPUT  *********************************************************/
     806             : /**********************************************************************/
     807             : /**********************************************************************/
     808             : 
     809             : 
     810             : /** \brief Get the position object of the output object.
     811             :  *
     812             :  * The stream manages a position object. The caller can use this function
     813             :  * to retrieve a read/write version of the current position.
     814             :  *
     815             :  * \return A modifiable version of the position object.
     816             :  */
     817        8242 : Position& Output::get_position()
     818             : {
     819        8242 :     return f_position;
     820             : }
     821             : 
     822             : 
     823             : /** \brief Get the position object of the output object.
     824             :  *
     825             :  * The stream manages a position object. The caller can use this function
     826             :  * to retrieve a read-only version of the current position.
     827             :  *
     828             :  * \return A constant version of the position object.
     829             :  */
     830           4 : Position const& Output::get_position() const
     831             : {
     832           4 :     return f_position;
     833             : }
     834             : 
     835             : 
     836             : /** \brief Write data to this output stream.
     837             :  *
     838             :  * This function writes the specified string to the output stream.
     839             :  * Since we pretty much only support text based files, we just
     840             :  * use this format.
     841             :  *
     842             :  * All outputs are done in UTF-8.
     843             :  *
     844             :  * If the function cannot write to the destination, then it throws
     845             :  * an error.
     846             :  *
     847             :  * \param[in] data  The string to be written to the output stream.
     848             :  */
     849      160107 : void Output::write(String const& data)
     850             : {
     851      160107 :     internal_write(data);
     852      160105 : }
     853             : 
     854             : 
     855             : /**********************************************************************/
     856             : /**********************************************************************/
     857             : /***  STANDARD OUTPUT  ************************************************/
     858             : /**********************************************************************/
     859             : /**********************************************************************/
     860             : 
     861             : /** \brief Initializes the standard output object.
     862             :  *
     863             :  * This function initializes the standard output object, more
     864             :  * specifically it defines its filename as "-".
     865             :  */
     866           2 : StandardOutput::StandardOutput()
     867             : {
     868           2 :     get_position().set_filename("-");
     869           2 : }
     870             : 
     871             : 
     872             : /** \brief Write a string to standard output.
     873             :  *
     874             :  * This function writes the specified string of data to the output
     875             :  * in UTF-8 format.
     876             :  *
     877             :  * Note that the streams do not save a BOM at the start of files.
     878             :  *
     879             :  * \param[in] data  The string to write in the standard output.
     880             :  */
     881           2 : void StandardOutput::internal_write(String const& data)
     882             : {
     883           2 :     std::cout << data.to_utf8();
     884           2 :     if(!std::cout)
     885             :     {
     886             :         // should we do something here?
     887           1 :         Message msg(message_level_t::MESSAGE_LEVEL_FATAL, err_code_t::AS_ERR_IO_ERROR, get_position());
     888           1 :         msg << "I/O error: could not write to output.";
     889           1 :         throw exception_exit(1, "I/O error: could not write to output.");
     890             :     }
     891           1 : }
     892             : 
     893             : 
     894             : /**********************************************************************/
     895             : /**********************************************************************/
     896             : /***  OUTPUT FILE  ****************************************************/
     897             : /**********************************************************************/
     898             : /**********************************************************************/
     899             : 
     900             : 
     901             : /** \brief Open the output file.
     902             :  *
     903             :  * This function is used to open the output file.
     904             :  *
     905             :  * One FileOutput object can only be used to output to one file. Trying
     906             :  * to reuse the same object with a different filename will generate
     907             :  * an exception.
     908             :  *
     909             :  * \todo
     910             :  * Generate an error message for why the file could not be opened.
     911             :  *
     912             :  * \param[in] filename  The name of the file to open for output.
     913             :  *
     914             :  * \return true if the file was successfully opened, false if an error
     915             :  *         occurred.
     916             :  */
     917        8229 : bool FileOutput::open(String const& filename)
     918             : {
     919        8229 :     if(f_file.is_open())
     920             :     {
     921           2 :         throw exception_file_already_open("file object for \"" + get_position().get_filename().to_utf8() + "\" cannot be reused for \"" + filename.to_utf8() + "\"");
     922             :     }
     923             : 
     924        8227 :     std::string utf8(filename.to_utf8());
     925        8227 :     f_file.open(utf8.c_str());
     926        8227 :     if(!f_file.is_open())
     927             :     {
     928           3 :         return false;
     929             :     }
     930        8224 :     get_position().set_filename(filename);
     931             : 
     932        8224 :     return true;
     933             : }
     934             : 
     935             : 
     936             : /** \brief Write to the output file.
     937             :  *
     938             :  * This function writes the specified \p data string to this output file.
     939             :  *
     940             :  * If an error occurs, the process writes a fatal error in stderr and
     941             :  * exits.
     942             :  *
     943             :  * \param[in] data  The string to write to the output file.
     944             :  */
     945      160047 : void FileOutput::internal_write(String const& data)
     946             : {
     947      160047 :     f_file << data.to_utf8();
     948      160047 :     if(!f_file)
     949             :     {
     950             :         // should we do something here?
     951           1 :         Message msg(message_level_t::MESSAGE_LEVEL_FATAL, err_code_t::AS_ERR_IO_ERROR, get_position());
     952           1 :         msg << "I/O error: could not write to output.";
     953           1 :         throw exception_exit(1, "I/O error: could not write to output.");
     954             :     }
     955      160046 : }
     956             : 
     957             : 
     958             : /**********************************************************************/
     959             : /**********************************************************************/
     960             : /***  OUTPUT STRING  **************************************************/
     961             : /**********************************************************************/
     962             : /**********************************************************************/
     963             : 
     964             : 
     965             : /** \brief Retrieve a read-only reference to the output string.
     966             :  *
     967             :  * This function is used to retrieve the output string used as a buffer
     968             :  * each time the write() function is called.
     969             :  *
     970             :  * \return A reference to the internal string.
     971             :  */
     972          22 : String const& StringOutput::get_string() const
     973             : {
     974          22 :     return f_string;
     975             : }
     976             : 
     977             : 
     978             : /** \brief Write to the output string.
     979             :  *
     980             :  * This function writes the specified \p data string to this output string.
     981             :  *
     982             :  * \param[in] data  The string to append to the output string.
     983             :  */
     984          58 : void StringOutput::internal_write(String const& data)
     985             : {
     986          58 :     f_string += data;
     987          58 : }
     988             : 
     989             : 
     990          63 : }
     991             : // namespace as2js
     992             : 
     993             : // vim: ts=4 sw=4 et

Generated by: LCOV version 1.10