// petidomo/libtext/text.hpp

/*
 * $Source$
 * $Revision$
 * $Date$
 *
 * Copyright (c) 1999 by CyberSolutions GmbH.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by CyberSolutions GmbH.
 *
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __LIB_TEXT_HPP__
#define __LIB_TEXT_HPP__

#include <stdexcept>
#include <string>
#include <cstring>
#include <iterator>
#include <list>
#include <sys/types.h>
#include "../RegExp/RegExp.hpp"

/** \file text.hpp

    A library for text parsing and manipulation.

    This library contains a couple of useful functions for dealing
    with strings, most notably a regular expression class and a
    generic config file parser.
*/

//////////////////////////////////////////////////
//                tokenize()                    //
//////////////////////////////////////////////////

/** Split \a buffer into tokens and hand each one to \a ii.

    A token is a maximal run of characters that are not contained in
    \a sep. Runs of separators are skipped as a whole, so leading,
    trailing and repeated separators never produce empty tokens.

    \param ii      insert iterator receiving one string per token, in
                   order of appearance; it is advanced after each token.
    \param buffer  the text to split.
    \param sep     NUL-terminated set of separator characters; must not
                   be a null pointer. With an empty set a non-empty
                   buffer is delivered as one single token.
*/
template<class T>
void tokenize(std::insert_iterator<T> & ii, const std::string & buffer,
	      const char * sep = " \t\r\n")
{
    // Skip leading separators; npos means the buffer holds no token at all.
    std::string::size_type pos = buffer.find_first_not_of(sep);
    while (pos != std::string::npos) {
	// end_pos == npos means the token runs to the end of the buffer;
	// substr() clamps the resulting over-long length.
	const std::string::size_type end_pos = buffer.find_first_of(sep, pos);
	const std::string token = buffer.substr(pos, end_pos - pos);
	*ii = token;
	++ii;
	// Start of the next token. Searching from npos simply yields npos,
	// which terminates the loop.
	pos = buffer.find_first_not_of(sep, end_pos);
    }
}

//////////////////////////////////////////////////
//               RegexTokenizer()               //
//////////////////////////////////////////////////


/** The RegexTokenizer extracts tokens from 'string' input.

    string or stream input has to be converted to string. This means
    the Tokenizer should be useful with large input which is divided
    into large chunks. A match is performed against a list of regular
    expressions. Each expression defines a match-separator pair.
    Regular Expressions are compiled with REG_EXTENDED flag.
*/

class RegexTokenizer: forward_iterator<RegexTokenizer, int> {
public:

    /** Maximum number of match registers (whole match plus
	subexpressions); this is the size of the \a pm and \a result
	arrays. */
    static const int N_pm=10;

    /** maximum length of a match */
    static const int N_substring=1024;

    /** The workspace. It is a static member, i.e. one single buffer
	shared by every RegexTokenizer object. */
    static char workspace[N_substring+1]; //+1 for trailing \0

    /** Modes (other than Custom) make the \a RegexTokenizer use a standard regular expression.

	\a Custom : The tokenizer uses the regular expression you specify.

	\a Word   : The tokenizer gives chunks of input separated by space and tabs.

	\a Line   : The tokenizer splits input at end of line.

	\a RFC    : The tokenizer splits input at end of line.
	         Lines may be continued by starting a new line with spaces or tabs.
		 These continuation characters are NOT stripped from the tokens.

     */
    enum Mode {Custom, Word, Line, RFC};

    /** RegexTokenizer is its own iterator. */
    typedef RegexTokenizer iterator;
private:
    string input;
    string result[N_pm];       // result[0] is the current token, result[i] the i-th register
    list<const char*>regex_src;// the source regexes needed for copy/begin/end
    list<regex_t>regex;        // not sure multiple regexes are a smart idea
    int whichregexwasmatched;
    regmatch_t pm[N_pm];
    int I_pm;                  // matched subexpressions
    int error;                 // result of regex calls
    int so,eo,previous_eo;     // positions; these three alone define iterator equality
    //int matchMask;//bitset; which fields to return by the * operator
protected:
    Mode mode;

    /** Moves on to the next token; used by both increment operators.
	Defined in the implementation file. */
    void advance();
    void reset();

    /** NOTE(review): presumably installs the input and the list of
	regular expressions; the meaning of the returned int is defined
	in the implementation file -- confirm there. */
    int set(string _input,list<const char*> _regex);
public:
    /** default constructor. */
    RegexTokenizer();

    /** Tokenize a string in a mode. */
    RegexTokenizer(string _input,Mode _mode);

    /**  Tokenize a string according to a single regular expression. */
    RegexTokenizer(string _input,const char* oneregex);

    /** Tokenize a string according to several regular expressions.
	(If the first regular expression fails, the next one will be tried. )
     */
    RegexTokenizer(string _input,list<const char*> _regex);

    /** copy constructor */
    RegexTokenizer(const RegexTokenizer &r);

    //void selectFields(int m){ matchMask= m; }

    /** The begin state (returned by value) */
    RegexTokenizer begin() const;

    /** The end state (returned by value) */
    RegexTokenizer end() const;

    /**
	from Input Iterator
        Returns the current token, i.e. a copy of register 0.
    */
    const string operator*() const
    { return result[0]; }


    /** from Input Iterator
        Returns the i-th matched subexpression.

	\param i  register index, 0 <= i < N_pm.
	\throws std::out_of_range if \a i lies outside that range
	        (formerly this read past the register array).
    */
    const string operator[](int i) const
    {
	if( i < 0 || i >= N_pm )
	    throw std::out_of_range("RegexTokenizer::operator[]: register index out of range");
	return result[i];
    }

    /** from Input Iterator
	PreIncrement: advances by one token and returns this tokenizer.
     */
    RegexTokenizer& operator++()
    { advance(); return *this; }

    /** from Input Iterator
	PostIncrement.

	The expression `it++` passes 0 as the dummy argument, so a
	loop bounded by `i>0` alone would never advance. Any \a i
	that is not positive therefore advances exactly one token;
	an explicit call `it.operator++(n)` with n > 0 still
	advances n tokens.

	Unlike a conventional post-increment this returns a reference
	to the already advanced tokenizer, not a copy of the previous
	state.
     */
    RegexTokenizer& operator++(int i)
    {
	int steps= (i > 0) ? i : 1;
	for( ; steps > 0; --steps )
	    advance();
	return *this;
    }

    /** Destructor */
    virtual ~RegexTokenizer();

    /** compare not equal.
	Only the positions \a so, \a eo and \a previous_eo are
	compared; the input string and the regular expressions are
	not looked at.
    */
    bool operator != (const RegexTokenizer &R) const{// const & I say, const
	return  so != R.so || eo != R.eo || previous_eo != R.previous_eo;
    }

    /** compare two RegexTokenizers; the exact negation of operator != */
    bool operator == (const RegexTokenizer &R) const{
	return !( *this != R );
    }

    /** print the current state of the RegexTokenizer */
    friend ostream& operator<<(ostream &o,const RegexTokenizer &r);
};


//////////////////////////////////////////////////
//                 TokenIterator                //
//////////////////////////////////////////////////


/** The TokenIterator extracts tokens from string or stream input.

    There are four main modes and a custom mode. In all modes, the
    backslash works as an escape character for the next character, i.e.
    'one\\\\backslash' is read as 'one\\backslash'.

    Description of the main modes:

    1. Words separated by whitespace, with "whitespace" consisting of
    tabulators and the blank.
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Word);
    \endcode

    2. Words separated by whitespace, "one word" is one token.
    whitespace is defined to be only tabulators and the blank.
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Word,true);
    \endcode

    3. Each line is a token.
    Escaped newlines will become part of the token.
    example:
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Line);
    \endcode

    4. RFC style:
    Whitespace at start of next line appends next line.
    The use of escaping the newline to append the next line,
    like in Makefiles, is NOT part of this mode.
    example:
    \code
    TokenIterator tokenize(inputStr,TokenIterator::RFC);
    \endcode

    5. The Custom Mode: The custom mode is intended for reading from
    data that is in almost human-readable-format, like /etc/passwd.
    Separating elements are not returned as Tokens, but are stored in
    thesep and previoussep. In /etc/passwd ':' is the separator,
    while newlines separate records.
    \code
    class MyCustomTokenIterator: public TokenIterator{
        public:

        MyCustomTokenIterator(string inputStr, bool b=false)
	    : TokenIterator(inputStr,TokenIterator::Custom, b){
	    eoltoken= '\n';
	    separator= ":\n";
        };

        MyCustomTokenIterator(istream &inputStr, bool b=false)
	    : TokenIterator(inputStr,TokenIterator::Custom, b){
	    eoltoken= '\n';
	    separator= ":\n";
        };
   \endcode
   See \a CustomTokenIterator.cpp for the full example.

    Bugs (Custom Mode): The tokenizer does not recognize a separator
    that is preceded by whitespace. Instead, it collapses the run of
    whitespace and offers that as the separator in \a thesep.
    This is probably not what you want.
*/


class TokenIterator:istream_iterator<string,int> {

private:
    // The stream tokens are read from.
    // NOTE(review): presumably ismyistream records whether this object
    // created *i itself (string constructor) and has to release it --
    // confirm in the implementation file.
    istream *i;
    bool ismyistream;
    string buffer;

    // The object handed out by end().
    static TokenIterator finalIterator;

    // NOTE(review): purpose not visible in this header.
    static string mooncheese;

public:
    /** \relates TokenIterator
	The modes allowed as arguments.
     */
    enum Mode {Word, Line, RFC, Custom};

    /** TokenIterator is its own iterator type. */
    typedef TokenIterator iterator;

protected:
    int brace;
    int braceoftoken;        // value reported by bracingdepth()
    string bracestack;
    bool braces;
    Mode mode;

    const char *whitespace;  // ALL whitespace must be listed here
    const char *separator;   // separators
    const char *continuation;// lists continuation
    const char *leftbrace;   // leftbrace[i] matches rightbrace[i]
    const char *rightbrace;  // supports multiple levels of braces
    char escapechar;// escapechar is the escape char; default \ .
    char eoltoken;  // use this instead of end of line
    char whitetoken;// use this instead of whitespace

    void setMode(Mode m);
    void reset();

public:
    /**
      Returns one token each call.
      An empty token does NOT signal the end of the input.
    */
    virtual string operator()();

    /** Dummy constructor:
	constructs an Iterator that has reached end. */
    TokenIterator();

    /** Constructor used to tokenize a string s,
	using \a Mode m (default is \a Word),
	by default without braces.
    */
    TokenIterator(string s, Mode m=Word, bool braces=false);

    /** Constructor used to tokenize from an input stream,
	using \a Mode m (default is \a Word),
	by default without braces.

	The input stream is consumed, which is why
	the TokenIterator doesn't offer backward iterator capabilities.
    */
    TokenIterator(istream &is, Mode m=Word, bool braces=false);


    /** A begin function returning a reference to an iterator.
	\a begin and \a end functions have been crafted to
	work with this way of using iterators:
	\code
	ifstream is(somefilename);
	TokenIterator tokenize(is);

	while( tokenize.begin() != tokenize.end() ){
	    string token= tokenize();
	    ...
	}
	\endcode
     */
    iterator& begin() const;


    /** An end function returning a reference to the shared static
	\a finalIterator . See \a begin .
     */
    inline iterator& end() const{ return finalIterator; };


    virtual ~TokenIterator();

    //! from Input Iterator
    //! Returns the current object in the stream.
    operator string() const;


    //! from Input Iterator
    //! Returns the current object in the stream,
    //! and the next object if the stream hasn't been read yet
    operator string();


    //! from Input Iterator
    //! Returns the current object in the stream.
    const string operator*() const;


    //! from Input Iterator
    //! Returns the current object in the stream,
    //! and the next object if the stream hasn't been read yet
    const string operator*();


    //! from Input Iterator
    //! Preincrement.
    TokenIterator& operator++();

    //! from Input Iterator
    //! Postincrement.
    //! this works .. almost
    //! Note that it returns a reference, not a copy of the old state.
    TokenIterator& operator++(int i);


    /** compare not equal.
	The argument is a non-const reference, so it cannot be a
	temporary or a const object. */
    bool operator != (TokenIterator &R) const;


    /** compare two Tokenizers.
	The argument is a non-const reference, so it cannot be a
	temporary or a const object. */
    bool operator == (TokenIterator &R) const;


    /** need this for foreach template */
    bool operator ! (void) const;


    /** Introducing an implicit conversion to bool is not
	good because it creates an ambiguity,
	since bool may be converted implicitly to int and String.
	Hence this named query instead. */
    bool hastoken (void) const;


    /** contains the separator that ended the token */
    char thesep;

    /** holds the separator that preceded the token */
    char previoussep;

    /** when using braces (in custom mode),
	check this to get the number of unclosed braces. */
    inline int bracingdepth() const{ return braceoftoken; };

    /** use this to compare with instead of end of line \\n */
    inline char eolToken() const{ return eoltoken; };

    /** use this to compare with instead of space */
    inline char whiteToken() const{ return whitetoken; };
};
/** \example TokenIterator_test.cpp */
/** \example CustomTokenIterator.cpp  */


/**
   The LexxStyleToken is returned by the \a LexxStyleTokenIterator
\code
   struct LexxStyleToken{
    enum Tokentype {T1_separator, T1_string};
    Tokentype ttype;
    string Tstring;
    char Tchar;
    };
\endcode
*/
struct LexxStyleToken{
    /** The two kinds of token: a separator or a string. */
    enum Tokentype {T1_separator, T1_string};
    /** Kind of this token. */
    Tokentype ttype;
    /** String payload. NOTE(review): presumably the text between two
	separators, meaningful when ttype is T1_string -- confirm in the
	LexxStyleTokenIterator implementation. */
    string Tstring;
    /** Character payload. NOTE(review): presumably the separator
	character, meaningful when ttype is T1_separator -- confirm in
	the LexxStyleTokenIterator implementation. */
    char Tchar;
};

/**
   The \a LexxStyleTokenIterator is a wrapper around the
   \a TokenIterator . It returns, in alternation, the separators
   and the parts of the string that lie between them.
*/
class LexxStyleTokenIterator{
private:
    // The wrapped TokenIterator. Held as a raw pointer; who owns and
    // releases the pointee is not visible in this header.
    TokenIterator *base;
    // NOTE(review): presumably remembers whether a separator or a
    // string is due next -- confirm in the implementation file.
    int state;
public:
    /**
       The current token. Reading this member
       does not proceed to the next token.
     */
    LexxStyleToken thetoken;

    /**
       Wrap the TokenIterator in the LexxStyleTokenIterator.
       \param Tbase the TokenIterator to wrap.
     */
    LexxStyleTokenIterator(TokenIterator *Tbase);

    /**
       Return the next token.
       The result is a reference, not a copy.
     */
    LexxStyleToken& operator()();
};


/**
   \a crop_token removes leading and trailing whitespace from a token.
   Example:
   \code
   cout << crop_token( " \thead tail \t" ) << endl; // prints "head tail"
   \endcode
*/

/** Strip leading and trailing whitespace from \a s.

    \param s           the token to crop.
    \param whitespace  the set of characters regarded as whitespace;
                       defaults to blank and tabulator.
    \return \a s without its leading and trailing run of whitespace
            characters. An empty string is returned if \a s is empty or
            consists of whitespace only.
*/
inline std::string crop_token(const std::string &s, const std::string whitespace=std::string(" \t") ){
    const std::string::size_type left = s.find_first_not_of(whitespace);

    // No non-whitespace character at all: constructing the result from
    // position npos would throw std::out_of_range, so return early.
    if( left == std::string::npos )
	return std::string();

    // A non-whitespace character exists, hence right is valid and >= left.
    const std::string::size_type right = s.find_last_not_of(whitespace);
    return std::string(s, left, right-left+1);
}


/** \a text_escape escapes newlines and escape characters
    inside a string such that it may be read by the \a TokenIterator
    in \a TokenIterator::Line or \a TokenIterator::Word Mode.
*/
/** Put the escape character (backslash) in front of every newline
    and every backslash of \a lines.

    \param lines  the text to escape; may be empty.
    \return a copy of \a lines in which each newline and each backslash
            is preceded by one additional backslash. All other
            characters are copied unchanged.
*/
inline std::string text_escape(const std::string &lines)
{
    // size_type instead of unsigned int: an index narrower than
    // size() could wrap around on very long input.
    typedef std::string::size_type size_type;
    const size_type length = lines.size();

    //
    // count how many characters have to be escaped,
    // so that the result can be allocated in one go
    //
    size_type count = 0;
    for( size_type i = 0; i < length; ++i )
	if( lines[i] == '\n' || lines[i] == '\\' )
	    ++count;

    std::string result;
    result.reserve( length + count );

    //
    // copy, inserting the escape character where needed
    //
    for( size_type i = 0; i < length; ++i ) {
	const char c = lines[i];
	if( c == '\n' || c == '\\' )
	    result += '\\';
	result += c;
    }

    return result;
}

#endif // !defined(__LIB_TEXT_HPP__)