Removed unused C++ code.

This commit is contained in:
Peter Simons 2000-12-13 17:37:25 +00:00
parent ee2cb50f52
commit 3402ec1a3b
7 changed files with 0 additions and 1753 deletions

View File

@ -1,59 +0,0 @@
#include "text.hpp"
#include <fstream.h>
/**
Example using the Custom Mode of the TokenIterator
*/
// Tokenizer for colon-separated records (e.g. /etc/passwd lines):
// fields end at ':' or newline and no characters are treated as
// skippable whitespace.  Built on TokenIterator's Custom mode.
class MyCustomTokenIterator: public TokenIterator{
public:
// Tokenize from an in-memory string.
MyCustomTokenIterator(string inputStr, bool b=false)
: TokenIterator(inputStr,TokenIterator::Custom, b){
eoltoken= '\n';
separator= ":\n";
whitespace= "";
};
// Tokenize from a stream (e.g. an open file).
MyCustomTokenIterator(istream &inputStr, bool b=false)
: TokenIterator(inputStr,TokenIterator::Custom, b){
eoltoken= '\n';
separator= ":\n";
whitespace= "";
};
};
// Demo: parse /etc/passwd with the colon tokenizer and print each
// account record field by field.
int main(int argc, char* argv[]){
ifstream infile("/etc/passwd");
MyCustomTokenIterator tokenize(infile);
while(!infile.eof()){
// the first seven fields of a passwd(5) line, in order
string user= tokenize();
string password= tokenize();
string userid = tokenize();
string groupid= tokenize();
string description= tokenize();
string home = tokenize();
string shell= tokenize();
if(password=="x")
password="shadowed";
cout << "----"<<endl;
cout << "user :" <<user <<endl;
cout << "password :" <<password <<endl;
cout << "userid :" <<userid <<endl;
cout << "groupid :" <<groupid <<endl;
cout << "description:" <<description <<endl;
cout << "home :" <<home <<endl;
cout << "shell :" <<shell <<endl;
// drain any fields beyond the seventh until end of line is seen
while( !infile.eof() && tokenize.thesep!= tokenize.eolToken() )
{
string trailing_garbage = tokenize();
cout<<"\\:"<< trailing_garbage;
}
cout<<endl;
}
return 0;
}

View File

@ -1,259 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH, Germany.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "text.hpp"
// Scratch buffer shared by ALL RegexTokenizer instances; advance()
// copies the current input window here before calling regexec().
// NOTE(review): being static, concurrent tokenizers would clobber each
// other — presumably single-threaded use; confirm.
char RegexTokenizer::workspace[RegexTokenizer::N_substring+1]="";
/** Default constructor: put the tokenizer into a defined idle state.
    Bug fixed: the body was empty, leaving the POD members (error,
    so/eo/previous_eo, pm[]) uninitialized until set() was called. */
RegexTokenizer::RegexTokenizer(){
    reset();
}
/** Return the tokenizer to its pristine state: empty input, cleared
    match registers pm[1..N_pm-1], and all positions at the start. */
void RegexTokenizer::reset(){
    input= string();
    for(int reg= N_pm-1; reg>0; --reg){
        pm[reg].rm_so= -1;
        pm[reg].rm_eo= -1;
    }
    so= 0;
    eo= 0;
    previous_eo= -1;
    error= 0;
}
/** Compile the given regular expressions and (re)initialize state.
    @param _input the text to tokenize
    @param _regex expressions tried in order; compiled with
                  REG_EXTENDED|REG_NEWLINE
    @return 0 on success, otherwise the regcomp() error code of the
            expression that failed to compile.
    Bug fixed: the function previously fell off the end of this
    non-void function without returning a value on success (UB). */
int RegexTokenizer::set(string _input,list<const char*> _regex){
    reset();
    input= _input;
    list<const char*>::iterator first= _regex.begin();
    list<const char*>::iterator last = _regex.end();
    while(first!=last){
        regex_t re;
        // REG_EXTENDED: extended regular expression syntax.
        // REG_NEWLINE : make ^ and $ match at line boundaries.
        int rc= regcomp (&re, *first, REG_EXTENDED|REG_NEWLINE);
        if(rc)
            return rc;
        regex.push_back(re);
        regex_src.push_back(*first);
        ++first;
    }
    return 0; // all expressions compiled successfully
}
/** Construct a tokenizer for one of the canned modes.
    Word: split on runs of blanks/tabs/newlines.
    Line: one token per line.
    RFC : a line plus its indented continuation lines is one token.
    Custom is rejected here — use the constructors that take explicit
    expressions instead. */
RegexTokenizer::RegexTokenizer(string _input,Mode _mode){
mode= _mode;
//create a list
list<const char*>alist;
switch(_mode){
case Word:
alist.push_back("([^ \t\n]*)([ \t\n]*)");
break;
case Line:
alist.push_back("^(.*)$\n");
break;
case RFC:
alist.push_back("((^.*$)((\n)^[ \t]+.*$)*)(\n)?");
//this works, but output is confusing
// that is, how to remove the glue ?
break;
case Custom:
//break;
default:
cerr<<"RegexTokenizer mode constructor called with pointless mode."<<endl;
}
set(_input,alist);
}
/** Tokenize @a _input against a single regular expression. */
RegexTokenizer::RegexTokenizer(string _input,const char* oneregex){
    list<const char*> patterns;
    patterns.push_back(oneregex);
    set(_input,patterns);
}
/** Tokenize @a _input, trying each expression in @a _regex in order. */
RegexTokenizer::RegexTokenizer(string _input,list<const char*> _regex){
set(_input,_regex);
}
/** Copy constructor: recompile the source expressions via set() and
    copy the match state.
    Bug fixed: the string array 'result' was copied with memcpy(),
    which byte-copies std::string objects and corrupts their internal
    bookkeeping (double free on destruction); the strings are now
    assigned element by element.  memcpy remains correct for the POD
    regmatch_t array 'pm'. */
RegexTokenizer::RegexTokenizer(const RegexTokenizer &r){
    set(r.input,r.regex_src);
    for(int i=0; i<N_pm; ++i)
        result[i]= r.result[i];
    whichregexwasmatched= r.whichregexwasmatched;
    memcpy(&pm[0], &r.pm[0], N_pm*sizeof(pm[0]) ); // regmatch_t is POD
    I_pm= r.I_pm;
    error= r.error;
    so= r.so;
    eo= r.eo;
    previous_eo= r.previous_eo;
    mode= r.mode;
}
/** Return a copy of this tokenizer rewound to the start of the input. */
RegexTokenizer RegexTokenizer::begin() const
{
    RegexTokenizer rewound(*this);
    rewound.error= 0;
    rewound.so= 0;
    rewound.eo= 0;
    rewound.previous_eo= -1;
    return rewound;
}
/** Return a copy of this tokenizer positioned past the end of input;
    used as the sentinel in iterator comparisons. */
RegexTokenizer RegexTokenizer::end() const
{
    RegexTokenizer sentinel(*this);
    const int len= sentinel.input.length();
    sentinel.error= 1;
    sentinel.so= len;
    sentinel.eo= len;
    sentinel.previous_eo= len;
    return sentinel;
}
/** Advance to the next match.
    The next N_substring chars of 'input' (from offset 'eo') are copied
    into the static 'workspace' buffer and matched against each compiled
    expression in turn; on success result[0] holds the whole match,
    result[1..I_pm-1] the subexpressions, and so/eo the match bounds.
    When input is exhausted the position fields are set to the end()
    state so that iterator comparison terminates loops. */
void RegexTokenizer::advance(){
//try all patterns until one matches
//cerr<<"advance"<<endl;
//wonder where to get the string from ?
//using a char * buffer is ugly, but there is no regex for string
// (no regex stuff which I'm aware of at the time of writing (1999) )
if(eo < (signed int)input.size()){
// there is no c_substr(eo,N_substring) ;-(
string sWorkspace(input,eo,N_substring);
// waste of time, but I´m not sure when sWorkspace.c_str() gets freed;
strncpy(workspace, sWorkspace.c_str(), N_substring) ;
}
else
workspace[0]='\0';
result[0]= string();
if(
error == 0 && /* regex ok ? */
*workspace != 0 && /* check end buffer */
previous_eo < eo /* make sure we finish */
)
{/* while matches found */
//cerr<<"go over regex's supplied"<<endl;
list<regex_t>::iterator first= regex.begin();
list<regex_t>::iterator last = regex.end();
error= 1;
previous_eo= eo;
while(error && result[0].empty() && first!=last){//check for empty buffer
{
//cerr<<endl <<"matching "<< workspace + eo<< endl;
/* substring found between pm.rm_so and pm.rm_eo */
/* This call to regexec() finds the next match */
error = regexec(&*first, workspace, N_pm, &pm[0], 0);
++first;
}
if(!error){
int final_so= eo;
int final_eo= eo;
//Go over the members of pm to see submatches
int i;
i=N_pm; while(--i>0){ result[i]= string(); }
i=0;
while(i<N_pm &&
pm[i].rm_so>=0 && pm[i].rm_eo>0 &&
pm[i].rm_so<N_substring && pm[i].rm_eo<=N_substring
){
// translate workspace-relative offsets to input-relative ones
int local_so= previous_eo+pm[i].rm_so;
int local_eo= previous_eo+pm[i].rm_eo;
if(i==0)
{
final_so= local_so;
final_eo= local_eo;
}
result[i]= input.substr(local_so, local_eo-local_so);
//cout <<"match["<<i<<"]{"<<pm[i].rm_so<<","<<pm[i].rm_eo<<"}";
//cout <<"("<< local_so <<","<< local_eo <<"): " << result[i] << endl;
i++;
}
so= final_so;
eo= final_eo;
I_pm= i;
}
else{
// NOTE(review): 'first' was already advanced above, so &*first here
// names the *next* expression and is past-the-end when the last one
// failed — confirm before relying on the rendered error message.
(void)regerror(error, &*first, workspace, N_substring);
}
}
}else{
//if the final match has been passed,
//signal end (to make != operator work ?PS)
// like in *this= end();
so= input.length();
eo= input.length();
previous_eo= eo;
}
}
/** Destructor: release every compiled pattern with regfree(). */
RegexTokenizer::~RegexTokenizer(){
    for(list<regex_t>::iterator it= regex.begin(); it!=regex.end(); ++it)
        (void) regfree (&*it);
}
/** Dump internal state as "(addr prev-so/eo ?error)[input]". */
ostream& operator<<(ostream &o, const RegexTokenizer &r){
    o <<"("<<&r
      <<" "<<r.previous_eo
      <<"-"<<r.so
      <<"/"<<r.eo
      <<" ?"<<r.error
      <<")["<<r.input<<"]"
      <<endl;
    return o;
}

View File

@ -1,234 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "text.hpp"
// begin test section
// Set verbose to 1 if you want to see the parsed tokens
const int verbose=0; // 0: silent — the tests signal failure via return value
int test1(){
string input("a22bbb4444ccccc999999999dfgDFG");
// Set up the tokenizer to match the input string
// against a regular expression.
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,"([a-z]*)([^a-z]*)");
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin 1*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout << *next // Entire match,
<<"="
<<next[1] // first subexpression,
<<"+"
<<next[2] // 2nd subexpr.
<< endl;
}
count++;
}
if(verbose)
cout << "--- end 1 ---"<<count<<endl;
if(count != 5)
return 1;
return 0;
}
int test2(){
string input("Word-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
// Set up the tokenizer to match the input string
// against a regular expression.
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,"([^ \t\n]*)([ \t\n]*)");
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin 2*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<next[1] // first matched subexpression
<<"["
<<next[2] // second matched subexpression
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end 2 ---"<<count<<endl;
if(count != 8)
return 1;
return 0;
}
int testWord(){
string input("Ein Satz aus vielen langen Wor-ten.\nUnd ein zweiter Satz.2\n3");
// Set up the tokenizer to match the input string
// against a regular expression that defines the word-wise tokenizing.
// The expression used is "([^ \t\n]*)([ \t\n]*)" .
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::Word);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin Word*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<next[1] // first matched subexpression,
<<"["
<<next[2] // 2nd matched subexpr.
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end Word ---"<<count<<endl;
if(count != 12)
return 1;
return 0;
}
int testLine(){
string input("Line-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
// Set up the tokenizer to match the input string
// against a regular expression that defines line by line tokenizing.
// The expression used is "^(.*)$\n" .
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::Line);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin Line*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<"'"
<<next[1] // first matched subexpression
<<"'"<<"["
<<next[2] // second matched subexpression
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end Line ---"<<count<<endl;
if(count != 5)
return 1;
return 0;
}
int testRFC(){
string input("RFC-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a\n\tSatz2b.");
// Set up the tokenizer to match the input string
// against a regular expression that defines RFC-style tokenizing.
// The expression used is "((^.*$)((\n)^[ \t]+.*$)*)(\n)?" .
// Bug: whitespace that glues one line to the next is not removed.
// (afaik, there not way to do this with a single regular expression).
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::RFC);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin RFC*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<"'"<<next[1]<<"'"<<"["<<next[2]<<"]"<<"["<<next[3]<<"]"<< endl;
// first, second and third matched subexpression
}
count++;
}
if(verbose)
cout << "--- end RFC ---"<<count<<endl;
if(count != 3)
return 1;
return 0;
}
/** Run the regex-tokenizer self tests in order; exit status is 0 iff
    every test passes (stops at the first failure, like the original
    short-circuiting || chain). */
int main(int argc, char *argv[])
{
    if(test1())    return 1;
    if(test2())    return 1;
    if(testWord()) return 1;
    if(testLine()) return 1;
    if(testRFC())  return 1;
    return 0;
}

View File

@ -1,380 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH, Germany.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdlib>
#include <cstring>
#include <strstream>
#include "text.hpp"
/** Return the index of the first occurrence of character @a s in the
    NUL-terminated string @a c, or -1 if it does not occur (searching
    for '\0' also yields -1, matching the original behavior).
    Bug fixed: the function ended in
    'if(!c[i]) return -1; else return -1;' — two identical arms;
    collapsed to a single return. */
static int mystrpos(const char *c,char s){
    for(int i=0; c[i]; ++i){
        if(c[i]==s)
            return i;
    }
    return -1;
}
// Sentinel stored in 'buffer' before the first token has been read;
// operator*() / operator string() compare against it to trigger a
// lazy first fetch.
string TokenIterator::mooncheese= string("The Moon is A green cheese (sheesh!).");
/** Put the iterator back into its initial, stream-less state. */
void TokenIterator::reset(){
    i= (istream*)0;
    brace= 0;
    bracestack[0]= '\0';
    braceoftoken= 0;
    thesep= '\0';
    previoussep= '\0';
    eoltoken= '\n';
    whitetoken= ' ';
    buffer= mooncheese; // sentinel: no token fetched yet
}
/** Install the character tables that drive tokenizing for mode m.
    operator() skips chars in 'whitespace' between tokens, ends a token
    at any char in 'separator', treats 'continuation' (RFC mode) as
    line-gluing indentation, and pairs leftbrace/rightbrace chars to
    group quoted regions when brace handling is enabled.  The backslash
    always escapes the following character. */
void TokenIterator::setMode(Mode m){
mode= m;
switch(mode){
case Word:
// words split on blanks/tabs; "..." groups words when braces are on
whitespace=" \t";
separator="";
continuation="";
leftbrace="\"";
rightbrace="\"";
escapechar = '\\';
break;
case Line:
// one token per line: nothing skipped, nothing grouped
whitespace="";
separator="";
continuation="";
leftbrace="";
rightbrace="";
escapechar = '\\';
break;
case RFC:
// a line starting with blank/tab continues the previous line
whitespace="";
separator="";
continuation=" \t";
leftbrace="";
rightbrace="";
escapechar ='\\';
break;
default:
// Custom: practical defaults; subclasses overwrite these fields
// (see the MyCustomTokenIterator example)
whitespace = " \t";
separator = ",;:+-=/\\@";
continuation="";
leftbrace = "\"([{<";
rightbrace = "\")]}>";
escapechar = '\\';
}
}
/** Default-construct: word mode, no brace grouping, no input stream. */
TokenIterator::TokenIterator(){
    reset();
    setMode(Word);
    braces= false;
    ismyistream= false;
}
// Shared end-of-sequence sentinel returned by begin() once the stream
// is exhausted; operator!= compares by address, so this single object
// serves as "end" for every iterator.
TokenIterator TokenIterator::finalIterator = TokenIterator();
//TokenIterator::TokenIterator(string s, Mode m=Word, bool b=false){
/** Construct from an in-memory string (defaults declared elsewhere).
    NOTE(review): the istrstream is built on s.c_str(), but 's' is a
    by-value parameter destroyed when this constructor returns, so the
    stream later reads from a dangling buffer — confirm and fix by
    keeping a copy of the string alive for the stream's lifetime
    (needs a new member, i.e. a header change). */
TokenIterator::TokenIterator(string s, Mode m, bool b){
reset();
braces= b;
setMode(m);
ismyistream= true;
i= new istrstream(s.c_str());
//++(*this);// read first value (not done; makes this unwieldly)
}
/** Construct from an existing stream (defaults declared elsewhere:
    m=Word, b=false).  The stream is borrowed, not owned, so the
    destructor will not delete it.  The first token is fetched lazily,
    not here. */
TokenIterator::TokenIterator(istream &is, Mode m, bool b){
    reset();
    braces= b;
    setMode(m);
    i= &is;
    ismyistream= false; // caller keeps ownership
}
/** Destructor: dispose of the stream only if this object created it. */
TokenIterator::~TokenIterator(){
    if(!ismyistream)
        return;
    delete i;
}
/** Return *this while the stream can still deliver characters,
    otherwise the shared end sentinel. */
TokenIterator::iterator& TokenIterator::begin() const
{
    if( !(i && i->good() && !i->eof()) )
        return finalIterator;
    return *const_cast<TokenIterator*> (this);
};
//! from Input Iterator.
//! Const conversion: hand back the currently buffered token as-is
//! (no lazy fetch is possible on a const object).
TokenIterator::operator string() const
{
    return buffer;
};
//! from Input Iterator.
//! Conversion with lazy fetch: pull the very first token on first use.
TokenIterator::operator string()
{
    if( mooncheese== buffer )   // nothing fetched yet
        (*this)();
    return buffer;
};
//! from Input Iterator.
//! Const dereference: the buffered token, without any lazy fetch.
const string TokenIterator::operator*() const
{
    return buffer;
};
//! from Input Iterator.
//! Dereference with lazy fetch of the very first token.
const string TokenIterator::operator*()
{
    if( mooncheese== buffer )   // nothing fetched yet
        (*this)();
    return buffer;
};
//! from Input Iterator.
//! Preincrement: read the next token into the buffer.
TokenIterator& TokenIterator::operator++()
{
    (*this)();
    return *this;
};
//! from Input Iterator
//! Postincrement.
//! this works .. almost
//! NOTE(review): 't' is a function-local static, so it snapshots the
//! iterator on the first call only and every call returns a reference
//! to that same stale copy.  Also a plain 'it++' passes i==0, so the
//! loop never runs and the iterator is not advanced at all — confirm
//! the intended semantics; a proper fix changes the return type and
//! therefore the interface.
TokenIterator& TokenIterator::operator++(int i)
{
static TokenIterator t = *this;
while(i>0){ --i; (*this)++; }
return t;
};
/** compare not equal.
    NOTE(review): only addresses are compared, so two distinct
    iterator objects are always "not equal" regardless of state; this
    works solely for the begin()/finalIterator idiom used here. */
bool TokenIterator::operator != (TokenIterator &R) const{// const & I say, const
// note: const TokenIterator &R will create a copy of R :-(
// this can't work; have to allow use of const in the above
// has to be compared differently( endflags .. ! )
return &R!= this;
}
/** Equality, defined as the negation of operator!= — i.e. an identity
    (same-object) test; see the caveats on operator!=. */
bool TokenIterator::operator == (TokenIterator &R) const{
    bool differs= ( *this != R );
    return !differs;
}
/** True when no further token can be read (used by foreach). */
bool TokenIterator::operator ! (void) const{
    if( i && i->good() && !i->eof() )
        return false;
    return true;
}
/** True while the underlying stream can still deliver characters. */
bool TokenIterator::hastoken(void) const{
    if(!i)
        return false;
    return i->good() && !i->eof();
}
/** Consume a line break that starts with character c.
    Recognizes "\r", "\n", "\r\n" and "\n\r"; a two-character pair is
    swallowed as a single break, any other lookahead character is
    pushed back onto the stream.
    @return true iff c started a line break. */
inline bool linefeed(char c, istream *i){
    if( c!='\r' && c!='\n' )
        return false;
    const char partner= (c=='\r') ? '\n' : '\r'; // the other half of a pair
    char ahead;
    if( i->get(ahead) && ahead!=partner )
        i->unget();   // not a two-char break: give the char back
    return true;
}
/** Fetch the next token from the stream.
    Reads character by character, honoring the escape character, the
    whitespace/separator tables installed by setMode(), line feeds
    (any of \r, \n, \r\n, \n\r via linefeed()) and, when enabled,
    nested brace groups.  The separator that ended the token is left
    in 'thesep' (and the previous one in 'previoussep').
    @return the token; possibly empty at end of input. */
string TokenIterator::operator()(){
char c= 0;
int pos;
previoussep= thesep;
buffer= string("");
while( i->get(c) ){
if(c==escapechar){
char d;// special translations need to be plugged in here
if( i->get(d) ){
if( brace && linefeed(d,i) )
buffer+= '\n';
else
buffer+= d;
}
}
else if( linefeed(c,i) ){
thesep= eoltoken;
{
switch(mode){
case Word:
if( previoussep!=whitetoken || buffer.length() )// space" = "
return buffer;
break;
case Line:
return buffer;
break;
case RFC:
{
// peek: a continuation char glues the next line to this token
char d;
if( i->get(d) ){
if(!strchr(continuation,d) ){
i->unget();
return buffer;
}else
i->unget();
}
do{
if(!i->get(d)){ return buffer; }
}while( strchr(continuation,d) );
//should "A\n \tB" be returned as one token "AB" or as "A B" ?
// currently, "AB" is returned
i->unget();// unget
}
break;
default:
return buffer;
}
}
}
else if( !(brace) && strchr(whitespace,c) ){ // brace>0 implies braces==true
if(buffer.length()){
thesep= whitetoken;
return buffer;// send token
}else
previoussep= whitetoken;// !?
;/* skip */
}else if(strchr(separator,c)){
thesep= c;
if( previoussep!=whitetoken || buffer.length() )// space" = "
return buffer;// send token
}else if(brace>0 && bracestack[brace]==c){
/* closing brace */
braceoftoken= brace;
brace--; /* pop stack of braces */
thesep= c;
return buffer;// send token
}else if( braces && (pos=mystrpos(leftbrace,c), pos>=0) ){//pos>0
/* opening brace: push the matching right brace onto the stack */
braceoftoken= brace;
bracestack[++brace]= rightbrace[pos];
if( previoussep!=whitetoken || buffer.length() ){// space" = "
thesep= c;
return buffer;// send token
}
}else{
/* normal, append to token */
buffer+= c;
}
}
return buffer;
}
/** Wrap an existing TokenIterator for lexer-style alternation between
    tokens and the separators that ended them. */
LexxStyleTokenIterator::LexxStyleTokenIterator(TokenIterator *Tbase){
    base= Tbase;
    state= 0;   // first call delivers a token, not a separator
}
/** Alternately return the next token string and the separator char
    that terminated it; 'ttype' mirrors the toggling state. */
LexxStyleToken& LexxStyleTokenIterator::operator()(){
    state= !state;
    thetoken.ttype= (LexxStyleToken::Tokentype)state;
    if(!state)
        thetoken.Tchar= base->thesep;       // separator phase
    else
        thetoken.Tstring= (*base)();        // token phase
    return thetoken;
}

View File

@ -1,222 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <cstdlib>
#include <fstream.h>
#include <string>
#include "text.hpp"
#include <algo.h>
// Set verbose=1 to see the tokens
const int verbose=1; // 1: echo every token as it is read
/**
The do_sth_with_aTokenIterator class
was written with the intent
to have the class written to cout
using the for_each template
*/
class do_sth_with_aTokenIterator {
string s;
public:
explicit do_sth_with_aTokenIterator() : s() {};
/// Print the iterator's current token to cout.
void operator()(const TokenIterator& s)
{ cout<< *s <<endl; };
//! Postincrement.
//! Bug fixed: the body was empty although the function is declared to
//! return a reference — flowing off the end of a value-returning
//! function is undefined behavior; now returns *this.
do_sth_with_aTokenIterator& operator++(int i)
{
return *this;
};
};
// Exercises TokenIterator in every mode against the file "test.txt".
// NOTE(review): all count checks below are disabled (';//return(1);'),
// so apart from a missing test.txt this program always exits 0 —
// confirm whether the expected counts (27/25/10/5) still hold.
int main(int argc,char *argv[]){
{
//Tokenize words
ifstream i("test.txt");
if(!i)
{ cerr<<"Test Data not found(test.txt)"<< endl; return(2); }
//Initialize the Tokenizer for word mode
TokenIterator tokenize(i,TokenIterator::Word);
string token="";
int count=0;
if(verbose)
cout<<endl<<"--Word"<<endl;
//Loop over all tokens
while( tokenize.hastoken() ){
token= tokenize();
if(verbose)
cout<<":"<<token<<"\n";
count ++;
}
if(verbose)
cout<<endl<<count<<endl;
if(count!=27)
;//return(1);
}
{
//Tokenize words, with " "
ifstream i("test.txt");
if(!i) return 255;
//Initialize the Tokenizer for word mode, "a b" is one word
TokenIterator tokenize(i,TokenIterator::Word,true);
string token="";
int count=0;
if(verbose)
cout<<endl<<"--\"Word\""<<endl;
//Loop over all tokens
while( tokenize.hastoken() ){
token= tokenize();
if(verbose)
cout<<":"<<token<<"\n";
count ++;
}
if(verbose)
cout<<endl<<count<<endl;
if(count!=25)
;//return(1);
}
{
//Tokenize lines
ifstream i("test.txt");
if(!i) return 255;
//Initialize the Tokenizer for line mode ( one line == one token )
TokenIterator tokenize(i,TokenIterator::Line);
string token="";
int count=0;
if(verbose)
cout<<endl<<"--Line"<<endl;
while( tokenize.hastoken() ){
token= tokenize();
if(verbose)
cout<<":"<<token<<"\n";
count ++;
}
if(verbose)
cout<<endl<<count<<endl;
if(count!=10)
;//return(1);
}
{
//Tokenize 'RFC-style'
ifstream i("test.txt");
if(!i) return 255;
//Initialize Tokenizer for RFC mode
// ( If the following line starts with space or tabulator,
// it is glued to the previous line ).
TokenIterator tokenize(i,TokenIterator::RFC);
string token="";
int count=0;
if(verbose)
cout<<endl<<"--RFC"<<endl;
//Loop over all tokens
while( tokenize.hastoken() ){
token= tokenize();
if(verbose)
cout<<":"<<token<<"\n";
count ++;
}
if(verbose)
cout<<endl<<count<<endl;
if(count!=5)
;//return(1);
}
//trying sequence capability ..
{
//Tokenize words
ifstream i("test.txt");
if(!i) return 255;
TokenIterator tokenize(i,TokenIterator::Word);
string token="";
int count=0;
if(verbose)
cout<<endl<<"--Word(seq)"<<endl;
//TokenIterator has only dummy capabilities, hence the warnings
// or maybe I just don't get what a unary function is ?
/* for_each (
tokenize.begin(),
tokenize.end(),
do_sth_with_aTokenIterator()
)
++count;
*/
if(verbose)
cout<<endl<<count<<endl;
if(count!=0)
cerr<<"You mean someone repaired that ?!"<<endl;
}
// there are different way to use the TokenIterator;
// this way seems intuitive to me.
// testing this way ..
{
ifstream i("test.txt");
if(!i) return 255;
TokenIterator lines(i, TokenIterator::Line);
cout<<endl;
cout<<"testing while( line= ++lines, ( lines.begin() != lines.end() ) )"
<<"\n cout<<\"<<line<<\"<<endl; "<<endl;
string line;
// comma operator: fetch a line, then test for end via begin()/end()
while( line= ++lines, ( lines.begin() != lines.end() ) )
cout<<"\""<<line<<"\""<<endl;
}
return 0;
}

View File

@ -1,539 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LIB_TEXT_HPP__
#define __LIB_TEXT_HPP__
#include <stdexcept>
#include <string>
#include <cstring>
#include <iterator>
#include <list>
#include <sys/types.h>
#include "../RegExp/RegExp.hpp"
/** \file text.hpp
A library for text parsing and manipulation.
This library contains a couple of useful functions for dealing
with strings, most notably a regular expression class and a
generic config file parser.
*/
//////////////////////////////////////////////////
// tokenize() //
//////////////////////////////////////////////////
/** Split @a buffer into tokens separated by any character in @a sep
    and append each non-empty token through the insert iterator @a ii.
    Leading, trailing and repeated separators are skipped, so no empty
    tokens are produced.
    Simplified: the original scanned past separators with
    find_first_not_of twice per iteration; one scan per token is
    enough and behavior is unchanged. */
template<class T>
void tokenize(insert_iterator<T> & ii, const string & buffer,
              const char * sep = " \t\r\n")
{
    // start of the first token (npos if the buffer is all separators)
    string::size_type pos = buffer.find_first_not_of(sep);
    while (pos != string::npos) {
        string::size_type end_pos = buffer.find_first_of(sep, pos);
        *ii = buffer.substr(pos, end_pos - pos);
        ++ii;
        // skip the separator run to the start of the next token
        pos = buffer.find_first_not_of(sep, end_pos);
    }
}
//////////////////////////////////////////////////
// RegexTokenizer() //
//////////////////////////////////////////////////
/** The RegexTokenizer extracts tokens from 'string' input.
string or stream input has to be converted to string. This means
the Tokenizer should be useful with large input which is divided
into large chunks. A match is performed against a list of regular
expressions. Each expression defines a match-separator pair.
Regular Expressions are compiled with REG_EXTENDED flag.
*/
class RegexTokenizer: forward_iterator<RegexTokenizer, int> {
public:
/** maximum number of registers, subexpressions */
static const int N_pm=10;
/** maximum length of a match */
static const int N_substring=1024;
/** the workspace */
static char workspace[N_substring+1]; //+1 for trailing \0
/** Modes (other than Custom) make the \a RegexTokenizer use a standard regular expression.
\a Custom : The tokenizer uses the regular expression you specify.
\a Word : The tokenizer gives chunks of input separated by space and tabs.
\a Line : The tokenizer splits input at end of line.
\a RFC : The tokenizer splits input at end of line.
Lines may be continued by starting a new line with spaces or tabs.
These continuation characters are NOT stripped from the tokens.
*/
enum Mode {Custom, Word, Line, RFC};
/** RegexTokenizer is it''s own iterator. */
typedef RegexTokenizer iterator;
private:
string input;
string result[N_pm];
list<const char*>regex_src;// the source regexes needed for copy/begin/end
list<regex_t>regex; // not sure multiple regexes are a smart idea
int whichregexwasmatched;
regmatch_t pm[N_pm];
int I_pm; // matched subexpressions
int error; // result of regex calls
int so,eo,previous_eo; // positions
//int matchMask;//bitset; which fields to return by the * operator
protected:
Mode mode;
void advance();
void reset();
int set(string _input,list<const char*> _regex);
public:
/** default constructor. */
RegexTokenizer();
/** Tokenize a string in a mode. */
RegexTokenizer(string _input,Mode _mode);
/** Tokenize a string according to a single regular expression. */
RegexTokenizer(string _input,const char* oneregex);
/** Tokenize a string according to several regular expressions.
(If the first regular expression fails, the next one will be tried. )
*/
RegexTokenizer(string _input,list<const char*> _regex);
/** copy constructor */
RegexTokenizer(const RegexTokenizer &r);
//void selectFields(int m){ matchMask= m; }
/** The begin state */
RegexTokenizer begin() const;
/** The end state */
RegexTokenizer end() const;
/**
from Input Iterator
Returns the current token.
*/
const string operator*() const
{ return result[0]; };
/** from Input Iterator.
Returns the i-th matched subexpression
(0 is the whole match).
NOTE(review): no bounds check is performed; i must stay
below N_pm or result[] is read out of range -- confirm callers.
*/
const string operator[](int i) const
{ return result[i]; };
/** from Input Iterator.
Pre-increment: advance to the next match and return *this.
*/
RegexTokenizer& operator++()
{ advance(); return *this; };
/** from Input Iterator.
Post-increment: advance to the next match.
BUG FIX: the int parameter of a postfix operator++ is a dummy --
the compiler always passes 0 when it is invoked as `tok++` --
so the original `while(i>0){ advance(); --i; }` loop never ran
and the iterator silently failed to advance.  Now advances exactly
once, matching the prefix form.  (Returning a reference rather than
a pre-advance copy is kept for interface compatibility.)
*/
RegexTokenizer& operator++(int)
{ (*this).advance(); return *this; };
/** Destructor */
virtual ~RegexTokenizer();
/** Inequality: two RegexTokenizers differ unless all three
position markers (start, end, previous end) agree. */
bool operator != (const RegexTokenizer &R) const{
return !( so == R.so && eo == R.eo && previous_eo == R.previous_eo );
}
/** Equality: true when the start, end, and previous-end
offsets of both tokenizers all agree. */
bool operator == (const RegexTokenizer &R) const{
return so == R.so && eo == R.eo && previous_eo == R.previous_eo;
}
/** print the current state of the RegexTokenizer */
friend ostream& operator<<(ostream &o,const RegexTokenizer &r);
};
//////////////////////////////////////////////////
// TokenIterator //
//////////////////////////////////////////////////
/** The TokenIterator extracts tokens from string or stream input.
There are four main modes and a custom mode. In all modes, the
backslash works as an escape character for the next character i.e.
'one\\\\backslash' is read as 'one\backslash'.
Description of the main modes:
1. Words separated by whitespace, with "whitespace" consisting of
tabulators and the blank.
\code
TokenIterator tokenize(inputStr,TokenIterator::Word);
\endcode
2. Words separated by whitespace, "one word" is one token.
whitespace is defined to be only tabulators and the blank.
\code
TokenIterator tokenize(inputStr,TokenIterator::Word,true);
\endcode
3. Each line is a token.
Escaped newlines will become part of the token.
example:
\code
TokenIterator tokenize(inputStr,TokenIterator::Line);
\endcode
4. RFC style:
Whitespace at start of next line appends next line.
The use of escaping the newline to append the next line,
like in Makefiles, is NOT part of this mode.
example:
\code
TokenIterator tokenize(inputStr,TokenIterator::RFC);
\endcode
5. The Custom Mode: The custom mode is intended for reading from
data that is in almost human-readable-format, like /etc/passwd.
Separating elements are not returned as Tokens, but are stored in
thesep and previoussep. In /etc/passwd ':' is the separator,
while newlines separate records.
\code
class MyCustomTokenIterator: public TokenIterator{
public:
MyCustomTokenIterator(string inputStr, bool b=false)
: TokenIterator(inputStr,TokenIterator::Custom, b){
eoltoken= '\n';
separator= ":\n";
};
MyCustomTokenIterator(istream &inputStr, bool b=false)
: TokenIterator(inputStr,TokenIterator::Custom, b){
eoltoken= '\n';
separator= ":\n";
};
\endcode
See \a CustomTokenIterator.cpp for the full example.
Bugs (Custom Mode): Does not recognize a separator preceded by whitespace.
Instead, the tokenizer will collapse a series of whitespace, but
will offer it as a separator in thesep.
This is probably not what you want.
*/
class TokenIterator:istream_iterator<string,int> {
private:
istream *i;              // stream the tokens are read from
bool ismyistream;        // NOTE(review): presumably true when *i is owned by this object (string ctor) -- confirm in the .cpp
string buffer;           // read-ahead storage for the current token
static TokenIterator finalIterator; // shared sentinel returned by end()
static string mooncheese;           // NOTE(review): purpose not visible here; looks like a dummy/sentinel string -- confirm
public:
/** \relates TokenIterator
The modes allowed as arguments.
*/
enum Mode {Word, Line, RFC, Custom};
/** TokenIterator serves as its own iterator type. */
typedef TokenIterator iterator;
protected:
int brace;          // current brace nesting level while scanning
int braceoftoken;   // nesting level recorded for the token just returned
string bracestack;  // open braces seen so far, used to match left/right pairs
bool braces;        // whether brace handling is enabled
Mode mode;          // active tokenizing mode
const char *whitespace; // ALL whitespace characters must be listed here
const char *separator; // separator characters
const char *continuation;// characters that continue a line (RFC mode)
const char *leftbrace; // leftbrace[i] matches rightbrace[i]
const char *rightbrace; // supports multiple levels of braces
char escapechar;// the escape character; default is backslash
char eoltoken; // stored in thesep instead of the literal end-of-line character
char whitetoken;// stored in thesep instead of a literal whitespace character
void setMode(Mode m); // installs the character tables for mode m
void reset();         // resets the scanning state
public:
/**
Returns one token each call.
An empty token does NOT signal the end of the input.
*/
virtual string operator()();
/** Dummy constructor:
constructs an Iterator that has already reached end. */
TokenIterator();
/** Constructor used to tokenize a string s,
using \a Mode m (default is Words),
by default without braces.
*/
TokenIterator(string s, Mode m=Word, bool braces=false);
/** Constructor used to tokenize from an input stream,
using \a Mode m (default is Words),
by default without braces.
The input stream is consumed, which is why
the TokenIterator doesn't offer backward iterator capabilities.
*/
TokenIterator(istream &is, Mode m=Word, bool braces=false);
/** A begin function returning an iterator.
\a begin and \a end functions have been crafted to
work with this way of using iterators:
\code
ifstream is(somefilename);
TokenIterator tokenize(is);
while( tokenize->begin() != tokenize->end() ){
string token= tokenize();
...
}
\endcode
*/
iterator& begin() const;
/** An end function returning the shared sentinel iterator. See \a begin .
*/
inline iterator& end() const{ return finalIterator; };
virtual ~TokenIterator();
//! from Input Iterator
//! Returns the current object in the stream.
operator string() const;
//! from Input Iterator
//! Returns the current object in the stream,
//! and reads the next object if the stream hasn't been read yet
operator string();
//! from Input Iterator
//! Returns the current object in the stream.
const string operator*() const;
//! from Input Iterator
//! Returns the current object in the stream,
//! and reads the next object if the stream hasn't been read yet
const string operator*();
//! from Input Iterator
//! Preincrement.
TokenIterator& operator++();
//! from Input Iterator
//! Postincrement.
//! NOTE(review): marked "this works .. almost" by the original author -- verify
TokenIterator& operator++(int i);
/** compare not equal */
bool operator != (TokenIterator &R) const;
/** compare two Tokenizers */
bool operator == (TokenIterator &R) const;
/** needed by the foreach template; true when no token is available */
bool operator ! (void) const;
/** Tests whether a token is available.
A named predicate is used because an implicit conversion
to bool would create an ambiguity, since bool may be
converted implicitly to both int and String. */
bool hastoken (void) const;
/** contains the separator that ended the token */
char thesep;
/** holds the separator that preceded the token */
char previoussep;
/** when using braces (in custom mode),
check this to get the number of unclosed braces. */
inline int bracingdepth() const{ return braceoftoken; };
/** use this to compare with instead of end of line \\n */
inline char eolToken() const{ return eoltoken; };
/** use this to compare with instead of space */
inline char whiteToken() const{ return whitetoken; };
};
/** \example TokenIterator_test.cpp */
/** \example CustomTokenIterator.cpp */
/**
The LexxStyleToken is returned by the \a LexxStyleTokenIterator
\code
struct LexxStyleToken{
enum Tokentype {T1_separator, T1_string};
Tokentype ttype;
string Tstring;
char Tchar;
};
\endcode
*/
struct LexxStyleToken{
/** discriminator values: a separator character or a token string */
enum Tokentype {T1_separator, T1_string};
Tokentype ttype;  // which of the two union-style fields below is meaningful
string Tstring;   // token text; presumably valid when ttype==T1_string -- confirm in iterator impl
char Tchar;       // separator character; presumably valid when ttype==T1_separator -- confirm
};
/**
The \a LexxStyleToken iterator is a wrapper around the
\a TokenIterator . It returns the separators and the parts
of the string that are separated by the separators
in alteration.
*/
class LexxStyleTokenIterator{
private:
TokenIterator *base; // wrapped tokenizer; not owned by this class (passed in by caller)
int state;           // alternation state between separator and string tokens
public:
/**
The most recently produced token.
Inspect this to look at the current token
without proceeding to the next one.
*/
LexxStyleToken thetoken;
/**
Wrap the given TokenIterator in the LexxStyleTokenIterator.
*/
LexxStyleTokenIterator(TokenIterator *Tbase);
/**
Return the next token (separators and separated
strings are delivered in alternation).
*/
LexxStyleToken& operator()();
};
/**
\a crop_token removes leading and trailing whitespace from a token.
Example:
\code
cout << crop_token( " \thead tail \t" ) << endl; // prints "head tail"
\endcode
*/
/**
\a crop_token removes leading and trailing whitespace from a token.
BUG FIXES: the default whitespace set was the typo " /t"
(space, slash, letter t) instead of " \t" (space, tab); and an
empty or all-whitespace input made find_first_not_of return
string::npos, which the string(s,npos,...) constructor turns
into an out_of_range exception -- such input now yields "".
*/
inline string crop_token(const string &s, const string whitespace=string(" \t") ){
size_t left = s.find_first_not_of(whitespace);
if( left == string::npos ) // empty or whitespace-only token
return string("");
size_t right= s.find_last_not_of(whitespace);
return string(s,left,right-left+1);
}
/** \a text_escape escapes newlines and escape characters
inside a string such that it may be read by the \a TokenIterator
in \a TokenIterator::Line or \a TokenIterator::Word Mode.
*/
/** \a text_escape escapes newlines and escape characters
inside a string such that it may be read by the \a TokenIterator
in \a TokenIterator::Line or \a TokenIterator::Word Mode.
Pass one counts the characters that need escaping so the
result buffer can be sized up front; pass two copies the
input, prefixing each newline or backslash with a backslash.
*/
inline string text_escape(const string &lines)
{
unsigned int extra= 0;
for( unsigned int pos=0; pos<lines.size(); ++pos )
{
if( lines[pos]=='\n' || lines[pos]=='\\' )
++extra;
}
string escaped("");
escaped.reserve( lines.size()+extra+1 );
for( unsigned int pos=0; pos<lines.size(); ++pos )
{
const char c= lines[pos];
if( c=='\n' || c=='\\' )
escaped += '\\';
escaped += c;
}
return escaped;
}
#endif // !defined(__LIB_TEXT_HPP__)

View File

@ -1,60 +0,0 @@
/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <iostream>
#include <list>
#include "text.hpp"
int
main(int argc, char ** argv)
{
//
// Test the tokenizer.
//
list<string> l;
insert_iterator< list<string> > ii(l, l.end());
const string buf("this is a test\n\n\n\r\tskfj \t blax\n");
tokenize(ii, buf);
cout << "Found " << l.size() << " tokens." << endl;
if( l.size()!=6 )
return 1;
copy(l.begin(), l.end(), ostream_iterator<string>(cout, "\n"));
return 0;
}