Removed unused C++ code.
This commit is contained in:
parent
ee2cb50f52
commit
3402ec1a3b
@ -1,59 +0,0 @@
|
||||
#include "text.hpp"
|
||||
#include <fstream.h>
|
||||
|
||||
/**
|
||||
Example using the Custom Mode of the TokenIterator
|
||||
*/
|
||||
class MyCustomTokenIterator: public TokenIterator{
|
||||
public:
|
||||
|
||||
MyCustomTokenIterator(string inputStr, bool b=false)
|
||||
: TokenIterator(inputStr,TokenIterator::Custom, b){
|
||||
eoltoken= '\n';
|
||||
separator= ":\n";
|
||||
whitespace= "";
|
||||
};
|
||||
|
||||
MyCustomTokenIterator(istream &inputStr, bool b=false)
|
||||
: TokenIterator(inputStr,TokenIterator::Custom, b){
|
||||
eoltoken= '\n';
|
||||
separator= ":\n";
|
||||
whitespace= "";
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
ifstream infile("/etc/passwd");
|
||||
MyCustomTokenIterator tokenize(infile);
|
||||
while(!infile.eof()){
|
||||
string user= tokenize();
|
||||
string password= tokenize();
|
||||
string userid = tokenize();
|
||||
string groupid= tokenize();
|
||||
string description= tokenize();
|
||||
string home = tokenize();
|
||||
string shell= tokenize();
|
||||
|
||||
if(password=="x")
|
||||
password="shadowed";
|
||||
|
||||
cout << "----"<<endl;
|
||||
cout << "user :" <<user <<endl;
|
||||
cout << "password :" <<password <<endl;
|
||||
cout << "userid :" <<userid <<endl;
|
||||
cout << "groupid :" <<groupid <<endl;
|
||||
cout << "description:" <<description <<endl;
|
||||
cout << "home :" <<home <<endl;
|
||||
cout << "shell :" <<shell <<endl;
|
||||
|
||||
while( !infile.eof() && tokenize.thesep!= tokenize.eolToken() )
|
||||
{
|
||||
string trailing_garbage = tokenize();
|
||||
cout<<"\\:"<< trailing_garbage;
|
||||
}
|
||||
cout<<endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1,259 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH, Germany.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#include "text.hpp"
|
||||
|
||||
char RegexTokenizer::workspace[RegexTokenizer::N_substring+1]="";
|
||||
|
||||
RegexTokenizer::RegexTokenizer(){
|
||||
}
|
||||
|
||||
void RegexTokenizer::reset(){
|
||||
input= string();
|
||||
int i=N_pm; while(--i>0){ pm[i].rm_so=-1; pm[i].rm_eo=-1; }
|
||||
so= 0;
|
||||
eo= 0;
|
||||
previous_eo= -1;
|
||||
error= 0;
|
||||
}
|
||||
|
||||
int RegexTokenizer::set(string _input,list<const char*> _regex){
|
||||
reset();
|
||||
input= _input;
|
||||
|
||||
list<const char*>::iterator first= _regex.begin();
|
||||
list<const char*>::iterator last = _regex.end();
|
||||
|
||||
while(first!=last){
|
||||
|
||||
regex_t re;
|
||||
int i;
|
||||
|
||||
//REG_EXTENDED
|
||||
//use extended regular expressions
|
||||
//REG_NEWLINE
|
||||
//makes ^...$ work to match newline/endofline
|
||||
|
||||
i= regcomp (&re, *first, REG_EXTENDED|REG_NEWLINE);
|
||||
if(i)
|
||||
return i;
|
||||
regex.push_back(re);
|
||||
regex_src.push_back(*first);
|
||||
++first;
|
||||
}
|
||||
}
|
||||
|
||||
RegexTokenizer::RegexTokenizer(string _input,Mode _mode){
|
||||
mode= _mode;
|
||||
//create a list
|
||||
list<const char*>alist;
|
||||
switch(_mode){
|
||||
case Word:
|
||||
alist.push_back("([^ \t\n]*)([ \t\n]*)");
|
||||
break;
|
||||
case Line:
|
||||
alist.push_back("^(.*)$\n");
|
||||
break;
|
||||
case RFC:
|
||||
alist.push_back("((^.*$)((\n)^[ \t]+.*$)*)(\n)?");
|
||||
//this works, but output is confusing
|
||||
// that is, how to remove the glue ?
|
||||
break;
|
||||
case Custom:
|
||||
//break;
|
||||
default:
|
||||
cerr<<"RegexTokenizer mode constructor called with pointless mode."<<endl;
|
||||
}
|
||||
set(_input,alist);
|
||||
}
|
||||
|
||||
RegexTokenizer::RegexTokenizer(string _input,const char* oneregex){
|
||||
//create a list
|
||||
list<const char*>alist;
|
||||
alist.push_back(oneregex);
|
||||
set(_input,alist);
|
||||
}
|
||||
|
||||
RegexTokenizer::RegexTokenizer(string _input,list<const char*> _regex){
|
||||
set(_input,_regex);
|
||||
}
|
||||
|
||||
RegexTokenizer::RegexTokenizer(const RegexTokenizer &r){
|
||||
//cerr<<"(copy constructor)"<<endl;
|
||||
set(r.input,r.regex_src);
|
||||
|
||||
// result= r.result; "ANSI C++ fobids ..."
|
||||
memcpy(&result[0], &r.result[0], N_pm*sizeof(result[0]) );
|
||||
|
||||
whichregexwasmatched= r.whichregexwasmatched;
|
||||
|
||||
// pm= r.pm;
|
||||
memcpy(&pm[0], &r.pm[0], N_pm*sizeof(pm[0]) );
|
||||
|
||||
I_pm= r.I_pm;
|
||||
error= r.error;
|
||||
so= r.so;
|
||||
eo= r.eo;
|
||||
previous_eo= r.previous_eo;
|
||||
mode= r.mode;
|
||||
}
|
||||
|
||||
|
||||
RegexTokenizer RegexTokenizer::begin() const
|
||||
{
|
||||
//cerr<<"(begin)"<<endl;
|
||||
RegexTokenizer RT(*this);
|
||||
RT.error= 0;
|
||||
RT.so= 0;
|
||||
RT.eo= 0;
|
||||
RT.previous_eo= -1;
|
||||
return RT;
|
||||
}
|
||||
|
||||
RegexTokenizer RegexTokenizer::end() const
|
||||
{
|
||||
//cerr<<"(end)"<<endl;
|
||||
RegexTokenizer RT(*this);
|
||||
RT.error= 1;
|
||||
RT.so= input.length();
|
||||
RT.eo= input.length();
|
||||
RT.previous_eo= RT.eo;
|
||||
return RT;
|
||||
}
|
||||
|
||||
void RegexTokenizer::advance(){
|
||||
//try all patterns until one matches
|
||||
|
||||
//cerr<<"advance"<<endl;
|
||||
//wonder where to get the string from ?
|
||||
//using a char * buffer is ugly, but there is no regex for string
|
||||
// (no regex stuff which I'm aware of at the time of writing (1999) )
|
||||
if(eo < (signed int)input.size()){
|
||||
// there is no c_substr(eo,N_substring) ;-(
|
||||
string sWorkspace(input,eo,N_substring);
|
||||
// waste of time, but I´m not sure when sWorkspace.c_str() gets freed;
|
||||
strncpy(workspace, sWorkspace.c_str(), N_substring) ;
|
||||
}
|
||||
else
|
||||
workspace[0]='\0';
|
||||
|
||||
result[0]= string();
|
||||
|
||||
if(
|
||||
error == 0 && /* regex ok ? */
|
||||
*workspace != 0 && /* check end buffer */
|
||||
previous_eo < eo /* make sure we finish */
|
||||
)
|
||||
{/* while matches found */
|
||||
//cerr<<"go over regex's supplied"<<endl;
|
||||
list<regex_t>::iterator first= regex.begin();
|
||||
list<regex_t>::iterator last = regex.end();
|
||||
error= 1;
|
||||
|
||||
previous_eo= eo;
|
||||
while(error && result[0].empty() && first!=last){//check for empty buffer
|
||||
{
|
||||
//cerr<<endl <<"matching "<< workspace + eo<< endl;
|
||||
|
||||
/* substring found between pm.rm_so and pm.rm_eo */
|
||||
/* This call to regexec() finds the next match */
|
||||
error = regexec(&*first, workspace, N_pm, &pm[0], 0);
|
||||
++first;
|
||||
}
|
||||
|
||||
if(!error){
|
||||
int final_so= eo;
|
||||
int final_eo= eo;
|
||||
//Go over the members of pm to see submatches
|
||||
int i;
|
||||
i=N_pm; while(--i>0){ result[i]= string(); }
|
||||
i=0;
|
||||
while(i<N_pm &&
|
||||
pm[i].rm_so>=0 && pm[i].rm_eo>0 &&
|
||||
pm[i].rm_so<N_substring && pm[i].rm_eo<=N_substring
|
||||
){
|
||||
int local_so= previous_eo+pm[i].rm_so;
|
||||
int local_eo= previous_eo+pm[i].rm_eo;
|
||||
if(i==0)
|
||||
{
|
||||
final_so= local_so;
|
||||
final_eo= local_eo;
|
||||
}
|
||||
result[i]= input.substr(local_so, local_eo-local_so);
|
||||
//cout <<"match["<<i<<"]{"<<pm[i].rm_so<<","<<pm[i].rm_eo<<"}";
|
||||
//cout <<"("<< local_so <<","<< local_eo <<"): " << result[i] << endl;
|
||||
|
||||
i++;
|
||||
}
|
||||
so= final_so;
|
||||
eo= final_eo;
|
||||
I_pm= i;
|
||||
}
|
||||
else{
|
||||
(void)regerror(error, &*first, workspace, N_substring);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
//if the final match has been passed,
|
||||
//signal end (to make != operator work ?PS)
|
||||
// like in *this= end();
|
||||
so= input.length();
|
||||
eo= input.length();
|
||||
previous_eo= eo;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
RegexTokenizer::~RegexTokenizer(){
|
||||
list<regex_t>::iterator first= regex.begin();
|
||||
list<regex_t>::iterator last = regex.end();
|
||||
|
||||
while(first!=last){
|
||||
//cerr<<"freeing "<<&*first<<endl;
|
||||
(void) regfree (&*first);
|
||||
++first;
|
||||
}
|
||||
}
|
||||
|
||||
ostream& operator<<(ostream &o, const RegexTokenizer &r){
|
||||
o<<"("<<&r<<" "<<r.previous_eo<<"-"<<r.so<<"/"<<r.eo<<" ?"<<r.error<<")["<<r.input<<"]"<<endl;
|
||||
return o;
|
||||
}
|
||||
|
||||
@ -1,234 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "text.hpp"
|
||||
|
||||
// begin test section
|
||||
// Set verbose to 1 if you want to see the parsed tokens
|
||||
const int verbose=0;
|
||||
|
||||
int test1(){
|
||||
string input("a22bbb4444ccccc999999999dfgDFG");
|
||||
|
||||
// Set up the tokenizer to match the input string
|
||||
// against a regular expression.
|
||||
// The entire match will be returned by *rt,
|
||||
// subexpressions by rt[1], rt[2], rt[3], ..
|
||||
RegexTokenizer rt(input,"([a-z]*)([^a-z]*)");
|
||||
|
||||
int count=0;
|
||||
RegexTokenizer::iterator next= rt.begin();
|
||||
const RegexTokenizer::iterator last= rt.end();
|
||||
|
||||
if(verbose)
|
||||
cout << "*** begin 1*** "<<endl;
|
||||
|
||||
while(next!=last){
|
||||
++next;// Preinc - processes input
|
||||
if(next!=last){
|
||||
if(verbose)
|
||||
cout << *next // Entire match,
|
||||
<<"="
|
||||
<<next[1] // first subexpression,
|
||||
<<"+"
|
||||
<<next[2] // 2nd subexpr.
|
||||
<< endl;
|
||||
}
|
||||
count++;
|
||||
|
||||
}
|
||||
if(verbose)
|
||||
cout << "--- end 1 ---"<<count<<endl;
|
||||
if(count != 5)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int test2(){
|
||||
string input("Word-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
|
||||
|
||||
// Set up the tokenizer to match the input string
|
||||
// against a regular expression.
|
||||
// The entire match will be returned by *rt,
|
||||
// subexpressions by rt[1], rt[2], rt[3], ..
|
||||
RegexTokenizer rt(input,"([^ \t\n]*)([ \t\n]*)");
|
||||
int count=0;
|
||||
|
||||
RegexTokenizer::iterator next= rt.begin();
|
||||
const RegexTokenizer::iterator last= rt.end();
|
||||
|
||||
if(verbose)
|
||||
cout << "*** begin 2*** "<<endl;
|
||||
|
||||
while(next!=last){
|
||||
++next;// Preinc - processes input
|
||||
if(next!=last){
|
||||
if(verbose)
|
||||
cout <<next[1] // first matched subexpression
|
||||
<<"["
|
||||
<<next[2] // second matched subexpression
|
||||
<<"]"<< endl;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
if(verbose)
|
||||
cout << "--- end 2 ---"<<count<<endl;
|
||||
if(count != 8)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int testWord(){
|
||||
string input("Ein Satz aus vielen langen Wor-ten.\nUnd ein zweiter Satz.2\n3");
|
||||
|
||||
// Set up the tokenizer to match the input string
|
||||
// against a regular expression that defines the word-wise tokenizing.
|
||||
// The expression used is "([^ \t\n]*)([ \t\n]*)" .
|
||||
// The entire match will be returned by *rt,
|
||||
// subexpressions by rt[1], rt[2], rt[3], ..
|
||||
RegexTokenizer rt(input,RegexTokenizer::Word);
|
||||
int count=0;
|
||||
|
||||
RegexTokenizer::iterator next= rt.begin();
|
||||
const RegexTokenizer::iterator last= rt.end();
|
||||
|
||||
if(verbose)
|
||||
cout << "*** begin Word*** "<<endl;
|
||||
|
||||
while(next!=last){
|
||||
++next;// Preinc - processes input
|
||||
if(next!=last){
|
||||
if(verbose)
|
||||
cout <<next[1] // first matched subexpression,
|
||||
<<"["
|
||||
<<next[2] // 2nd matched subexpr.
|
||||
<<"]"<< endl;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
if(verbose)
|
||||
cout << "--- end Word ---"<<count<<endl;
|
||||
if(count != 12)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int testLine(){
|
||||
string input("Line-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
|
||||
|
||||
// Set up the tokenizer to match the input string
|
||||
// against a regular expression that defines line by line tokenizing.
|
||||
// The expression used is "^(.*)$\n" .
|
||||
// The entire match will be returned by *rt,
|
||||
// subexpressions by rt[1], rt[2], rt[3], ..
|
||||
RegexTokenizer rt(input,RegexTokenizer::Line);
|
||||
int count=0;
|
||||
|
||||
RegexTokenizer::iterator next= rt.begin();
|
||||
const RegexTokenizer::iterator last= rt.end();
|
||||
|
||||
if(verbose)
|
||||
cout << "*** begin Line*** "<<endl;
|
||||
|
||||
while(next!=last){
|
||||
++next;// Preinc - processes input
|
||||
if(next!=last){
|
||||
if(verbose)
|
||||
cout <<"'"
|
||||
<<next[1] // first matched subexpression
|
||||
<<"'"<<"["
|
||||
<<next[2] // second matched subexpression
|
||||
<<"]"<< endl;
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
if(verbose)
|
||||
cout << "--- end Line ---"<<count<<endl;
|
||||
if(count != 5)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int testRFC(){
|
||||
string input("RFC-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a\n\tSatz2b.");
|
||||
|
||||
// Set up the tokenizer to match the input string
|
||||
// against a regular expression that defines RFC-style tokenizing.
|
||||
// The expression used is "((^.*$)((\n)^[ \t]+.*$)*)(\n)?" .
|
||||
// Bug: whitespace that glues one line to the next is not removed.
|
||||
// (afaik, there not way to do this with a single regular expression).
|
||||
// The entire match will be returned by *rt,
|
||||
// subexpressions by rt[1], rt[2], rt[3], ..
|
||||
RegexTokenizer rt(input,RegexTokenizer::RFC);
|
||||
int count=0;
|
||||
|
||||
RegexTokenizer::iterator next= rt.begin();
|
||||
const RegexTokenizer::iterator last= rt.end();
|
||||
|
||||
if(verbose)
|
||||
cout << "*** begin RFC*** "<<endl;
|
||||
while(next!=last){
|
||||
++next;// Preinc - processes input
|
||||
if(next!=last){
|
||||
if(verbose)
|
||||
cout <<"'"<<next[1]<<"'"<<"["<<next[2]<<"]"<<"["<<next[3]<<"]"<< endl;
|
||||
// first, second and third matched subexpression
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
||||
if(verbose)
|
||||
cout << "--- end RFC ---"<<count<<endl;
|
||||
if(count != 3)
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
return test1() || test2() || testWord() || testLine() || testRFC() ;
|
||||
}
|
||||
@ -1,380 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH, Germany.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <strstream>
|
||||
|
||||
#include "text.hpp"
|
||||
|
||||
static int mystrpos(const char *c,char s){
|
||||
int i=0;
|
||||
while(c[i])
|
||||
{
|
||||
if(c[i]==s){
|
||||
return i;
|
||||
}
|
||||
i++;
|
||||
};
|
||||
if(!c[i])
|
||||
return -1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
string TokenIterator::mooncheese= string("The Moon is A green cheese (sheesh!).");
|
||||
|
||||
void TokenIterator::reset(){
|
||||
i= (istream*)0;
|
||||
brace= 0; bracestack[0]='\0';
|
||||
braceoftoken= 0;
|
||||
thesep= '\0'; previoussep= '\0';
|
||||
eoltoken= '\n';
|
||||
whitetoken= ' ';
|
||||
buffer= mooncheese;
|
||||
}
|
||||
|
||||
void TokenIterator::setMode(Mode m){
|
||||
mode= m;
|
||||
switch(mode){
|
||||
case Word:
|
||||
whitespace=" \t";
|
||||
separator="";
|
||||
continuation="";
|
||||
leftbrace="\"";
|
||||
rightbrace="\"";
|
||||
escapechar = '\\';
|
||||
break;
|
||||
case Line:
|
||||
whitespace="";
|
||||
separator="";
|
||||
continuation="";
|
||||
leftbrace="";
|
||||
rightbrace="";
|
||||
escapechar = '\\';
|
||||
break;
|
||||
case RFC:
|
||||
whitespace="";
|
||||
separator="";
|
||||
continuation=" \t";
|
||||
leftbrace="";
|
||||
rightbrace="";
|
||||
escapechar ='\\';
|
||||
break;
|
||||
default:
|
||||
whitespace = " \t";
|
||||
separator = ",;:+-=/\\@";
|
||||
continuation="";
|
||||
leftbrace = "\"([{<";
|
||||
rightbrace = "\")]}>";
|
||||
escapechar = '\\';
|
||||
}
|
||||
}
|
||||
|
||||
TokenIterator::TokenIterator(){
|
||||
reset();
|
||||
braces= false;
|
||||
setMode(Word);
|
||||
ismyistream= false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
TokenIterator TokenIterator::finalIterator = TokenIterator();
|
||||
|
||||
//TokenIterator::TokenIterator(string s, Mode m=Word, bool b=false){
|
||||
TokenIterator::TokenIterator(string s, Mode m, bool b){
|
||||
reset();
|
||||
braces= b;
|
||||
setMode(m);
|
||||
ismyistream= true;
|
||||
i= new istrstream(s.c_str());
|
||||
//++(*this);// read first value (not done; makes this unwieldly)
|
||||
}
|
||||
|
||||
//TokenIterator::TokenIterator(istream &is, Mode m=Word, bool b=false){
|
||||
TokenIterator::TokenIterator(istream &is, Mode m, bool b){
|
||||
reset();
|
||||
braces= b;
|
||||
setMode(m);
|
||||
ismyistream= false;
|
||||
i= &is;
|
||||
//++(*this);// read first value (not done; makes this unwieldly)
|
||||
}
|
||||
|
||||
TokenIterator::~TokenIterator(){
|
||||
if(ismyistream)
|
||||
delete i;
|
||||
}
|
||||
|
||||
|
||||
TokenIterator::iterator& TokenIterator::begin() const
|
||||
{
|
||||
if( i && i->good() && !i->eof() )
|
||||
return *const_cast<TokenIterator*> (this);
|
||||
else
|
||||
return finalIterator;
|
||||
};
|
||||
|
||||
|
||||
//! from Input Iterator
|
||||
//! Returns the next object in the stream.
|
||||
TokenIterator::operator string() const
|
||||
{
|
||||
return buffer;
|
||||
};
|
||||
|
||||
|
||||
//! from Input Iterator
|
||||
//! Returns the next object in the stream.
|
||||
TokenIterator::operator string()
|
||||
{
|
||||
if( buffer== mooncheese )
|
||||
(*this)();
|
||||
return buffer;
|
||||
};
|
||||
|
||||
|
||||
//! from Input Iterator
|
||||
//! Returns the next object in the stream.
|
||||
const string TokenIterator::operator*() const
|
||||
{
|
||||
return buffer;
|
||||
};
|
||||
|
||||
|
||||
//! from Input Iterator
|
||||
//! Returns the next object in the stream.
|
||||
const string TokenIterator::operator*()
|
||||
{
|
||||
if( buffer== mooncheese )
|
||||
(*this)();
|
||||
return buffer;
|
||||
};
|
||||
|
||||
|
||||
//! from Input Iterator
|
||||
//! Preincrement.
|
||||
TokenIterator& TokenIterator::operator++()
|
||||
{
|
||||
(*this)(); return *this;
|
||||
};
|
||||
|
||||
//! from Input Iterator
|
||||
//! Postincrement.
|
||||
//! this works .. almost
|
||||
|
||||
|
||||
TokenIterator& TokenIterator::operator++(int i)
|
||||
{
|
||||
static TokenIterator t = *this;
|
||||
while(i>0){ --i; (*this)++; }
|
||||
return t;
|
||||
};
|
||||
|
||||
|
||||
/** compare not equal */
|
||||
bool TokenIterator::operator != (TokenIterator &R) const{// const & I say, const
|
||||
// note: const TokenIterator &R will create a copy of R :-(
|
||||
// this can't work; have to allow use of const in the above
|
||||
// has to be compared differently( endflags .. ! )
|
||||
|
||||
return &R!= this;
|
||||
}
|
||||
|
||||
/** compare two Tokenizers */
|
||||
bool TokenIterator::operator == (TokenIterator &R) const{
|
||||
// note: const TokenIterator &R will create a copy of R :-(
|
||||
// this can't work; have to allow use of const in the above
|
||||
// has to be compared differently( endflags .. ! )
|
||||
|
||||
return !( *this != R );
|
||||
}
|
||||
|
||||
/** need this for foreach template */
|
||||
bool TokenIterator::operator ! (void) const{
|
||||
return !( i && i->good() && !i->eof() );
|
||||
}
|
||||
|
||||
/** need this for fun */
|
||||
bool TokenIterator::hastoken(void) const{
|
||||
return i && i->good() && !i->eof();
|
||||
}
|
||||
|
||||
|
||||
|
||||
inline bool linefeed(char c, istream *i){
|
||||
if(c=='\r'){
|
||||
char d;
|
||||
if( i->get(d) ){
|
||||
if(d=='\n')
|
||||
;/* dos line feed */
|
||||
else
|
||||
i->unget();
|
||||
}
|
||||
return true;
|
||||
}else if(c=='\n'){
|
||||
char d;
|
||||
if( i->get(d) ){
|
||||
if(d=='\r')
|
||||
;/* carriage return after line feed(?) */
|
||||
else
|
||||
i->unget();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
string TokenIterator::operator()(){
|
||||
char c= 0;
|
||||
int pos;
|
||||
|
||||
previoussep= thesep;
|
||||
buffer= string("");
|
||||
|
||||
while( i->get(c) ){
|
||||
|
||||
if(c==escapechar){
|
||||
|
||||
char d;// special translations need to be plugged in here
|
||||
|
||||
if( i->get(d) ){
|
||||
if( brace && linefeed(d,i) )
|
||||
buffer+= '\n';
|
||||
else
|
||||
buffer+= d;
|
||||
}
|
||||
}
|
||||
|
||||
else if( linefeed(c,i) ){
|
||||
|
||||
thesep= eoltoken;
|
||||
{
|
||||
switch(mode){
|
||||
case Word:
|
||||
if( previoussep!=whitetoken || buffer.length() )// space" = "
|
||||
return buffer;
|
||||
break;
|
||||
case Line:
|
||||
return buffer;
|
||||
break;
|
||||
case RFC:
|
||||
{
|
||||
char d;
|
||||
if( i->get(d) ){
|
||||
if(!strchr(continuation,d) ){
|
||||
i->unget();
|
||||
return buffer;
|
||||
}else
|
||||
i->unget();
|
||||
|
||||
}
|
||||
|
||||
do{
|
||||
if(!i->get(d)){ return buffer; }
|
||||
}while( strchr(continuation,d) );
|
||||
|
||||
//should "A\n \tB" be returned as one token "AB" or as "A B" ?
|
||||
// currently, "AB" is returned
|
||||
i->unget();// unget
|
||||
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return buffer;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if( !(brace) && strchr(whitespace,c) ){ // brace>0 implies braces==true
|
||||
|
||||
if(buffer.length()){
|
||||
thesep= whitetoken;
|
||||
return buffer;// send token
|
||||
}else
|
||||
previoussep= whitetoken;// !?
|
||||
;/* skip */
|
||||
|
||||
}else if(strchr(separator,c)){
|
||||
thesep= c;
|
||||
if( previoussep!=whitetoken || buffer.length() )// space" = "
|
||||
return buffer;// send token
|
||||
|
||||
}else if(brace>0 && bracestack[brace]==c){
|
||||
|
||||
/* closing brace */
|
||||
braceoftoken= brace;
|
||||
brace--; /* pop stack of braces */
|
||||
|
||||
thesep= c;
|
||||
return buffer;// send token
|
||||
|
||||
}else if( braces && (pos=mystrpos(leftbrace,c), pos>=0) ){//pos>0
|
||||
|
||||
/* opening brace */
|
||||
braceoftoken= brace;
|
||||
bracestack[++brace]= rightbrace[pos];
|
||||
if( previoussep!=whitetoken || buffer.length() ){// space" = "
|
||||
thesep= c;
|
||||
return buffer;// send token
|
||||
}
|
||||
}else{
|
||||
/* normal, append to token */
|
||||
buffer+= c;
|
||||
}
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
||||
LexxStyleTokenIterator::LexxStyleTokenIterator(TokenIterator *Tbase){
|
||||
state=0;
|
||||
base= Tbase;
|
||||
}
|
||||
|
||||
LexxStyleToken& LexxStyleTokenIterator::operator()(){
|
||||
state= !state;
|
||||
thetoken.ttype= (LexxStyleToken::Tokentype)state;
|
||||
if(state){
|
||||
thetoken.Tstring= (*base)();
|
||||
}else{
|
||||
thetoken.Tchar= base->thesep;
|
||||
}
|
||||
return thetoken;
|
||||
}
|
||||
@ -1,222 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <fstream.h>
|
||||
#include <string>
|
||||
#include "text.hpp"
|
||||
#include <algo.h>
|
||||
|
||||
|
||||
// Set verbose=1 to see the tokens
|
||||
const int verbose=1;
|
||||
|
||||
/**
|
||||
The do_sth_with_aTokenIterator class
|
||||
was written with the intent
|
||||
to have the class written to cout
|
||||
using the for_each template
|
||||
*/
|
||||
class do_sth_with_aTokenIterator {
|
||||
string s;
|
||||
public:
|
||||
explicit do_sth_with_aTokenIterator() : s() {};
|
||||
void operator()(const TokenIterator& s)
|
||||
{ cout<< *s <<endl; };
|
||||
|
||||
//! Postincrement.
|
||||
do_sth_with_aTokenIterator& operator++(int i)
|
||||
{
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
int main(int argc,char *argv[]){
|
||||
{
|
||||
//Tokenize words
|
||||
ifstream i("test.txt");
|
||||
if(!i)
|
||||
{ cerr<<"Test Data not found(test.txt)"<< endl; return(2); }
|
||||
|
||||
//Initialize the Tokenizer for word mode
|
||||
TokenIterator tokenize(i,TokenIterator::Word);
|
||||
|
||||
string token="";
|
||||
int count=0;
|
||||
if(verbose)
|
||||
cout<<endl<<"--Word"<<endl;
|
||||
|
||||
//Loop over all tokens
|
||||
while( tokenize.hastoken() ){
|
||||
token= tokenize();
|
||||
if(verbose)
|
||||
cout<<":"<<token<<"\n";
|
||||
count ++;
|
||||
}
|
||||
if(verbose)
|
||||
cout<<endl<<count<<endl;
|
||||
if(count!=27)
|
||||
;//return(1);
|
||||
}
|
||||
{
|
||||
//Tokenize words, with " "
|
||||
ifstream i("test.txt");
|
||||
if(!i) return 255;
|
||||
|
||||
//Initialize the Tokenizer for word mode, "a b" is one word
|
||||
TokenIterator tokenize(i,TokenIterator::Word,true);
|
||||
|
||||
string token="";
|
||||
int count=0;
|
||||
if(verbose)
|
||||
cout<<endl<<"--\"Word\""<<endl;
|
||||
|
||||
//Loop over all tokens
|
||||
while( tokenize.hastoken() ){
|
||||
token= tokenize();
|
||||
if(verbose)
|
||||
cout<<":"<<token<<"\n";
|
||||
count ++;
|
||||
}
|
||||
if(verbose)
|
||||
cout<<endl<<count<<endl;
|
||||
if(count!=25)
|
||||
;//return(1);
|
||||
}
|
||||
{
|
||||
//Tokenize lines
|
||||
ifstream i("test.txt");
|
||||
if(!i) return 255;
|
||||
|
||||
//Initialize the Tokenizer for line mode ( one line == one token )
|
||||
TokenIterator tokenize(i,TokenIterator::Line);
|
||||
|
||||
string token="";
|
||||
int count=0;
|
||||
if(verbose)
|
||||
cout<<endl<<"--Line"<<endl;
|
||||
|
||||
while( tokenize.hastoken() ){
|
||||
token= tokenize();
|
||||
if(verbose)
|
||||
cout<<":"<<token<<"\n";
|
||||
count ++;
|
||||
}
|
||||
if(verbose)
|
||||
cout<<endl<<count<<endl;
|
||||
if(count!=10)
|
||||
;//return(1);
|
||||
}
|
||||
{
|
||||
//Tokenize 'RFC-style'
|
||||
ifstream i("test.txt");
|
||||
if(!i) return 255;
|
||||
|
||||
//Initialize Tokenizer for RFC mode
|
||||
// ( If the following line starts with space or tabulator,
|
||||
// it is glued to the previous line ).
|
||||
TokenIterator tokenize(i,TokenIterator::RFC);
|
||||
string token="";
|
||||
int count=0;
|
||||
if(verbose)
|
||||
cout<<endl<<"--RFC"<<endl;
|
||||
|
||||
//Loop over all tokens
|
||||
while( tokenize.hastoken() ){
|
||||
token= tokenize();
|
||||
if(verbose)
|
||||
cout<<":"<<token<<"\n";
|
||||
count ++;
|
||||
}
|
||||
if(verbose)
|
||||
cout<<endl<<count<<endl;
|
||||
if(count!=5)
|
||||
;//return(1);
|
||||
}
|
||||
|
||||
//trying sequence capability ..
|
||||
{
|
||||
//Tokenize words
|
||||
ifstream i("test.txt");
|
||||
if(!i) return 255;
|
||||
TokenIterator tokenize(i,TokenIterator::Word);
|
||||
string token="";
|
||||
int count=0;
|
||||
if(verbose)
|
||||
cout<<endl<<"--Word(seq)"<<endl;
|
||||
|
||||
//TokenIterator has only dummy capabilities, hence the warnings
|
||||
// or maybe I just don't get what a unary function is ?
|
||||
/* for_each (
|
||||
tokenize.begin(),
|
||||
tokenize.end(),
|
||||
do_sth_with_aTokenIterator()
|
||||
)
|
||||
++count;
|
||||
*/
|
||||
|
||||
if(verbose)
|
||||
cout<<endl<<count<<endl;
|
||||
|
||||
if(count!=0)
|
||||
cerr<<"You mean someone repaired that ?!"<<endl;
|
||||
}
|
||||
|
||||
// there are different way to use the TokenIterator;
|
||||
// this way seems intuitive to me.
|
||||
// testing this way ..
|
||||
{
|
||||
|
||||
ifstream i("test.txt");
|
||||
if(!i) return 255;
|
||||
TokenIterator lines(i, TokenIterator::Line);
|
||||
|
||||
cout<<endl;
|
||||
cout<<"testing while( line= ++lines, ( lines.begin() != lines.end() ) )"
|
||||
<<"\n cout<<\"<<line<<\"<<endl; "<<endl;
|
||||
|
||||
|
||||
string line;
|
||||
while( line= ++lines, ( lines.begin() != lines.end() ) )
|
||||
cout<<"\""<<line<<"\""<<endl;
|
||||
|
||||
}
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
539
libtext/text.hpp
539
libtext/text.hpp
@ -1,539 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef __LIB_TEXT_HPP__
|
||||
#define __LIB_TEXT_HPP__
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <iterator>
|
||||
#include <list>
|
||||
#include <sys/types.h>
|
||||
#include "../RegExp/RegExp.hpp"
|
||||
|
||||
/** \file text.hpp
|
||||
|
||||
A library for text parsing and manipulation.
|
||||
|
||||
This library contains a couple of useful functions for dealing
|
||||
with strings, most notably a regular expression class and a
|
||||
generic config file parser.
|
||||
*/
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// tokenize() //
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
/**
   Split \a buffer into tokens at any of the characters in \a sep and
   append each (non-empty) token through the insert iterator \a ii.

   Leading, trailing and repeated separators produce no empty tokens.

   \param ii     insert iterator the tokens are written to (advanced in place)
   \param buffer text to split; not modified
   \param sep    set of separator characters; default: blank, tab, CR, LF
*/
template<class T>
void tokenize(std::insert_iterator<T> & ii, const std::string & buffer,
              const char * sep = " \t\r\n")
{
    std::string::size_type pos = 0;
    while (pos != std::string::npos) {
        std::string::size_type end_pos = buffer.find_first_of(sep, pos);
        std::string token = buffer.substr(pos, end_pos - pos);
        if (!token.empty()) {
            *ii = token;
            ++ii;
        }
        // Skip the run of separators following the token.
        // (FIX: the original additionally advanced end_pos inside the
        // if-block above -- a redundant second skip with no effect.)
        if (end_pos != std::string::npos)
            end_pos = buffer.find_first_not_of(sep, end_pos);
        pos = end_pos;
    }
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// RegexTokenizer() //
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
|
||||
/** The RegexTokenizer extracts tokens from 'string' input.

    string or stream input has to be converted to string. This means
    the Tokenizer should be useful with large input which is divided
    into large chunks. A match is performed against a list of regular
    expressions. Each expression defines a match-separator pair.
    Regular Expressions are compiled with the REG_EXTENDED flag.
*/

// NOTE(review): forward_iterator<T,Dist> is a pre-standard (SGI) base
// class; modern code would use iterator traits instead -- confirm the
// toolchain this was written for before touching it.
class RegexTokenizer: forward_iterator<RegexTokenizer, int> {
public:

  /** maximum number of registers, subexpressions */
  static const int N_pm=10;

  /** maximum length of a match */
  static const int N_substring=1024;

  /** the workspace */
  static char workspace[N_substring+1]; //+1 for trailing \0

  /** Modes (other than Custom) make the \a RegexTokenizer use a standard regular expression.

      \a Custom : The tokenizer uses the regular expression you specify.

      \a Word : The tokenizer gives chunks of input separated by space and tabs.

      \a Line : The tokenizer splits input at end of line.

      \a RFC : The tokenizer splits input at end of line.
      Lines may be continued by starting a new line with spaces or tabs.
      These continuation characters are NOT stripped from the tokens.

  */
  enum Mode {Custom, Word, Line, RFC};

  /** RegexTokenizer is its own iterator. */
  typedef RegexTokenizer iterator;
private:
  string input;                 // the text being tokenized
  string result[N_pm];          // result[0]: whole match; [1..]: subexpressions
  list<const char*>regex_src;// the source regexes needed for copy/begin/end
  list<regex_t>regex; // not sure multiple regexes are a smart idea
  int whichregexwasmatched;     // index of the regex that produced the match
  regmatch_t pm[N_pm];          // POSIX match offsets from regexec()
  int I_pm; // matched subexpressions
  int error; // result of regex calls
  int so,eo,previous_eo; // positions: start/end of match, end of previous match
  //int matchMask;//bitset; which fields to return by the * operator
protected:
  Mode mode;
  void advance();               // move to the next match (defined in text.cpp)
  void reset();
  int set(string _input,list<const char*> _regex);
public:
  /** default constructor. */
  RegexTokenizer();

  /** Tokenize a string in a mode. */
  RegexTokenizer(string _input,Mode _mode);

  /** Tokenize a string according to a single regular expression. */
  RegexTokenizer(string _input,const char* oneregex);

  /** Tokenize a string according to several regular expressions.
      (If the first regular expression fails, the next one will be tried. )
  */
  RegexTokenizer(string _input,list<const char*> _regex);

  /** copy constructor */
  RegexTokenizer(const RegexTokenizer &r);

  //void selectFields(int m){ matchMask= m; }

  /** The begin state */
  RegexTokenizer begin() const;

  /** The end state */
  RegexTokenizer end() const;

  /**
     from Input Iterator
     Returns the current token (the whole match).
  */
  const string operator*() const
  { return result[0]; };


  /** from Input Iterator
      Returns the i-th matched subexpression.
  */
  const string operator[](int i) const
  { return result[i]; };

  /** from Input Iterator
      PreIncrement
  */
  RegexTokenizer& operator++()
  { (*this).advance(); return *this; };

  /** from Input Iterator
      PostIncrement
      NOTE(review): non-standard semantics -- a plain `x++` passes i==0,
      so it advances zero times; `x.operator++(n)` advances n times.
      Confirm callers before "fixing" this.
  */
  RegexTokenizer& operator++(int i)
  { while(i>0){ (*this).advance(); --i; }; return *this; };

  /** Destructor */
  virtual ~RegexTokenizer();

  /** compare not equal (position-based comparison only) */
  bool operator != (const RegexTokenizer &R) const{// const & I say, const
    return so != R.so || eo != R.eo || previous_eo != R.previous_eo;
  }

  /** compare two RegexTokenizers */
  bool operator == (const RegexTokenizer &R) const{
    return !( *this != R );
  }

  /** print the current state of the RegexTokenizer */
  friend ostream& operator<<(ostream &o,const RegexTokenizer &r);
};
|
||||
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// TokenIterator //
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
|
||||
/** The TokenIterator extracts tokens from string or stream input.

    There are four main modes and a custom mode. In all modes, the
    backslash works as an escape character for the next character i.e.
    'one\\\\backslash' is read as 'one\backslash'.

    Description of the main modes:

    1. Words separated by whitespace, with "whitespace" consisting of
    tabulators and the blank.
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Word);
    \endcode

    2. Words separated by whitespace, "one word" is one token.
    whitespace is defined to be only tabulators and the blank.
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Word,true);
    \endcode

    3. Each line is a token.
    Escaped newlines will become part of the token.
    example:
    \code
    TokenIterator tokenize(inputStr,TokenIterator::Line);
    \endcode

    4. RFC style:
    Whitespace at start of next line appends next line.
    The use of escaping the newline to append the next line,
    like in Makefiles, is NOT part of this mode.
    example:
    \code
    TokenIterator tokenize(inputStr,TokenIterator::RFC);
    \endcode

    5. The Custom Mode: The custom mode is intended for reading from
    data that is in almost human-readable-format, like /etc/passwd.
    Separating elements are not returned as Tokens, but are stored in
    thesep and previoussep. In /etc/passwd ':' is the separator,
    while newlines separate records.
    \code
    class MyCustomTokenIterator: public TokenIterator{
    public:

      MyCustomTokenIterator(string inputStr, bool b=false)
        : TokenIterator(inputStr,TokenIterator::Custom, b){
        eoltoken= '\n';
        separator= ":\n";
      };

      MyCustomTokenIterator(istream &inputStr, bool b=false)
        : TokenIterator(inputStr,TokenIterator::Custom, b){
        eoltoken= '\n';
        separator= ":\n";
      };
    \endcode
    See \a CustomTokenIterator.cpp for the full example.

    Bugs (Custom Mode): Does not recognize a separator preceded by whitespace.
    Instead, the tokenizer will collapse a series of whitespace, but
    will offer it as a separator in thesep.
    This is probably not what you want.
*/


// NOTE(review): istream_iterator<string,int> is the pre-standard
// two-argument form (value type, distance type) -- confirm toolchain.
class TokenIterator:istream_iterator<string,int> {

private:
  istream *i;            // input stream tokens are drawn from
  bool ismyistream;      // true when this object owns (must delete) *i
  string buffer;         // current working text

  // shared sentinel returned by end()
  static TokenIterator finalIterator;

  // NOTE(review): purpose not evident from this header -- presumably a
  // fallback token value; confirm against text.cpp.
  static string mooncheese;

public:
  /** \relates TokenIterator
      The modes allowed as arguments.
  */
  enum Mode {Word, Line, RFC, Custom};

  typedef TokenIterator iterator;

protected:
  int brace;               // current brace nesting level while scanning
  int braceoftoken;        // nesting level the current token was read at
  string bracestack;       // stack of currently-open brace characters
  bool braces;             // whether brace handling is enabled
  Mode mode;

  const char *whitespace; // ALL whitespace must be listed here
  const char *separator; // separators
  const char *continuation;// lists continuation
  const char *leftbrace; // leftbrace[i] matches rightbrace[i]
  const char *rightbrace; // supports multiple levels of braces
  char escapechar;// escapechar is the escape char; default \ .
  char eoltoken; // use this instead of end of line
  char whitetoken;// use this instead of whitespace

  void setMode(Mode m);
  void reset();

public:
  /**
     Returns one token each call.
     An empty token does NOT signal the end of the input.
  */
  virtual string operator()();

  /** Dummy constructor */
  /** constructs an Iterator that has reached end */
  TokenIterator();

  /** Constructor used to tokenize a string s,
      using \a Mode m (default is Words),
      by default without braces.
  */
  TokenIterator(string s, Mode m=Word, bool braces=false);

  /** Constructor used to tokenize from an input stream,
      using \a Mode m (default is Words),
      by default without braces.

      The input stream is consumed, which is why
      the TokenIterator doesn't offer backward iterator capabilities.
  */
  TokenIterator(istream &is, Mode m=Word, bool braces=false);


  /** A begin function returning an iterator.
      \a begin and \a end functions have been crafted to
      work with this way of using iterators:
      \code
      ifstream is(somefilename);
      TokenIterator tokenize(is);

      while( tokenize->begin() != tokenize->end() ){
        string token= tokenize();
        ...
      }
      \endcode
  */
  iterator& begin() const;


  /** An end function returning an iterator. See \a begin .
  */
  inline iterator& end() const{ return finalIterator; };


  virtual ~TokenIterator();

  //! from Input Iterator
  //! Returns the current object in the stream.
  operator string() const;


  //! from Input Iterator
  //! Returns the current object in the stream,
  //! and the next object if the stream hasn't been read yet
  operator string();


  //! from Input Iterator
  //! Returns the current object in the stream.
  const string operator*() const;


  //! from Input Iterator
  //! Returns the current object in the stream,
  //! and the next object if the stream hasn't been read yet
  const string operator*();


  //! from Input Iterator
  //! Preincrement.
  TokenIterator& operator++();

  //! from Input Iterator
  //! Postincrement.
  //! this works .. almost
  TokenIterator& operator++(int i);


  /** compare not equal */
  bool operator != (TokenIterator &R) const;


  /** compare two Tokenizers */
  bool operator == (TokenIterator &R) const;


  /** need this for foreach template */
  bool operator ! (void) const;


  /** Introducing an implicit conversion to bool is not */
  /** good because it creates an ambiguity, */
  /** since bool may be converted implicitly to int and String. */
  bool hastoken (void) const;



  /** contains the separator that ended the token */
  char thesep;

  /** holds the separator that preceded the token */
  char previoussep;

  /** when using braces (in custom mode),
      check this to get the number of unclosed braces. */
  inline int bracingdepth() const{ return braceoftoken; };

  /** use this to compare with instead of end of line \\n */
  inline char eolToken() const{ return eoltoken; };

  /** use this to compare with instead of space */
  inline char whiteToken() const{ return whitetoken; };
};
/** \example TokenIterator_test.cpp */
/** \example CustomTokenIterator.cpp */
|
||||
/** \example TokenIterator_test.cpp */
|
||||
/** \example CustomTokenIterator.cpp */
|
||||
|
||||
|
||||
/**
   The LexxStyleToken is returned by the \a LexxStyleTokenIterator .
   \code
   struct LexxStyleToken{
     enum Tokentype {T1_separator, T1_string};
     Tokentype ttype;
     string Tstring;
     char Tchar;
   };
   \endcode
*/
struct LexxStyleToken{
  // Discriminator: which kind of token this is.
  enum Tokentype {T1_separator, T1_string};
  Tokentype ttype;
  // NOTE(review): presumably Tstring holds the text for T1_string tokens
  // and Tchar the separator character for T1_separator tokens -- confirm
  // against the implementation in text.cpp.
  string Tstring;
  char Tchar;
};
|
||||
|
||||
/**
   The \a LexxStyleTokenIterator is a wrapper around the
   \a TokenIterator . It returns the separators and the parts
   of the string that are separated by the separators
   in alternation.
*/
class LexxStyleTokenIterator{
private:
  TokenIterator *base;  // wrapped iterator; not owned as far as this header shows
  int state;            // NOTE(review): presumably alternates separator/string -- confirm
public:
  /**
     The current token,
     readable without proceeding to the next token.
  */
  LexxStyleToken thetoken;

  /**
     Wrap the TokenIterator in the LexxStyleTokenIterator.
  */
  LexxStyleTokenIterator(TokenIterator *Tbase);

  /**
     Return the next token.
  */
  LexxStyleToken& operator()();
};
|
||||
|
||||
|
||||
/**
   \a crop_token removes leading and trailing whitespace from a token.
   Example:
   \code
   cout << crop_token( " \thead tail \t" ) << endl; // prints "head tail"
   \endcode

   \param s          the token; not modified
   \param whitespace characters treated as whitespace; default blank and tab
   \return \a s without leading/trailing whitespace characters; the empty
           string when \a s consists only of whitespace
*/
inline std::string crop_token(const std::string &s,
                              const std::string whitespace = std::string(" \t"))
{
    // BUG FIX: the default used to be " /t" (blank, '/', letter 't'),
    // not the blank + tabulator the documentation promises.
    const std::string::size_type left = s.find_first_not_of(whitespace);
    // BUG FIX: an all-whitespace input made find_first_not_of return
    // npos, and string(s, npos, ...) would throw out_of_range.
    if (left == std::string::npos)
        return std::string();
    const std::string::size_type right = s.find_last_not_of(whitespace);
    return s.substr(left, right - left + 1);
}
|
||||
|
||||
|
||||
/** \a text_escape escapes newlines and escape characters
    inside a string such that it may be read by the \a TokenIterator
    in \a TokenIterator::Line or \a TokenIterator::Word Mode.
*/
inline std::string text_escape(const std::string &lines)
{
    //
    // First pass: count how many characters need an escape prefix.
    //
    std::string::size_type extra = 0;
    for (std::string::const_iterator it = lines.begin(); it != lines.end(); ++it)
        if (*it == '\n' || *it == '\\')
            ++extra;

    std::string escaped;
    escaped.reserve(lines.size() + extra + 1);

    //
    // Second pass: copy, prefixing every newline/backslash with '\'.
    //
    for (std::string::const_iterator it = lines.begin(); it != lines.end(); ++it) {
        if (*it == '\n' || *it == '\\')
            escaped += '\\';
        escaped += *it;
    }

    return escaped;
}
|
||||
|
||||
#endif // !defined(__LIB_TEXT_HPP__)
|
||||
@ -1,60 +0,0 @@
|
||||
/*
|
||||
* $Source$
|
||||
* $Revision$
|
||||
* $Date$
|
||||
*
|
||||
* Copyright (c) 1999 by CyberSolutions GmbH.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. All advertising materials mentioning features or use of this software
|
||||
* must display the following acknowledgement:
|
||||
* This product includes software developed by CyberSolutions GmbH.
|
||||
*
|
||||
* 4. The name of the author may not be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <list>
|
||||
|
||||
#include "text.hpp"
|
||||
|
||||
int
|
||||
main(int argc, char ** argv)
|
||||
{
|
||||
//
|
||||
// Test the tokenizer.
|
||||
//
|
||||
list<string> l;
|
||||
insert_iterator< list<string> > ii(l, l.end());
|
||||
const string buf("this is a test\n\n\n\r\tskfj \t blax\n");
|
||||
tokenize(ii, buf);
|
||||
cout << "Found " << l.size() << " tokens." << endl;
|
||||
if( l.size()!=6 )
|
||||
return 1;
|
||||
copy(l.begin(), l.end(), ostream_iterator<string>(cout, "\n"));
|
||||
|
||||
return 0;
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user