260 lines
6.9 KiB
C++
260 lines
6.9 KiB
C++
/*
 * $Source$
 * $Revision$
 * $Date$
 *
 * Copyright (c) 1999 by CyberSolutions GmbH, Germany.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by CyberSolutions GmbH.
 *
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
||
|
||
|
||
#include "text.hpp"
|
||
|
||
// Scratch buffer shared by every RegexTokenizer: advance() copies the part of
// the input it is about to match into it, because regexec() needs a C string.
// Sized N_substring+1 and zero-initialised.
char RegexTokenizer::workspace[RegexTokenizer::N_substring+1]= { '\0' };
|
||
|
||
/*
 * Default constructor: creates a tokenizer with empty input and no patterns.
 *
 * The scalar members are put into the same start state that set() uses, so
 * that advance(), operator<< and the copy constructor never read
 * indeterminate values from a default-constructed object.
 */
RegexTokenizer::RegexTokenizer(){
    mode= Custom;   // no predefined pattern set is attached
    reset();
}
|
||
|
||
void RegexTokenizer::reset(){
|
||
input= string();
|
||
int i=N_pm; while(--i>0){ pm[i].rm_so=-1; pm[i].rm_eo=-1; }
|
||
so= 0;
|
||
eo= 0;
|
||
previous_eo= -1;
|
||
error= 0;
|
||
}
|
||
|
||
/*
 * Install a new input string and compile the supplied patterns.
 *
 * The tokenizer state is reset first. Each pattern is then compiled with
 * regcomp() and appended, together with its source text, to the pattern
 * lists of this object.
 *
 * _input  text to be tokenized
 * _regex  pattern sources, tried by advance() in list order
 *
 * Returns 0 when every pattern compiled. Otherwise returns the regcomp()
 * error code of the first pattern that failed; that code is also stored in
 * `error`, so a tokenizer built by a constructor (which cannot report the
 * return value) does not go on matching with an incomplete pattern list.
 */
int RegexTokenizer::set(string _input,list<const char*> _regex){
    reset();
    input= _input;

    list<const char*>::iterator first= _regex.begin();
    list<const char*>::iterator last = _regex.end();

    for(; first!=last; ++first){
        regex_t re;

        // REG_EXTENDED: use extended regular expressions
        // REG_NEWLINE : makes ^...$ work to match newline/endofline
        const int rc= regcomp(&re, *first, REG_EXTENDED|REG_NEWLINE);
        if(rc){
            error= rc;
            return rc;
        }
        regex.push_back(re);
        regex_src.push_back(*first);
    }

    // The original fell off the end of this non-void function here.
    return 0;
}
|
||
|
||
/*
 * Construct a tokenizer that uses one of the predefined patterns.
 *
 * _input  text to be tokenized
 * _mode   Word, Line or RFC select a built-in pattern; any other value
 *         (including Custom) is reported on cerr and leaves the pattern
 *         list empty.
 */
RegexTokenizer::RegexTokenizer(string _input,Mode _mode){
    mode= _mode;

    // Pick the built-in pattern for the requested mode, if there is one.
    const char* builtin= 0;
    if(_mode==Word){
        builtin= "([^ \t\n]*)([ \t\n]*)";
    }
    else if(_mode==Line){
        builtin= "^(.*)$\n";
    }
    else if(_mode==RFC){
        // this works, but output is confusing
        // that is, how to remove the glue ?
        builtin= "((^.*$)((\n)^[ \t]+.*$)*)(\n)?";
    }
    else{
        cerr<<"RegexTokenizer mode constructor called with pointless mode."<<endl;
    }

    list<const char*> patterns;
    if(builtin!=0){
        patterns.push_back(builtin);
    }
    set(_input,patterns);
}
|
||
|
||
/*
 * Construct a tokenizer from a single caller-supplied pattern.
 *
 * _input    text to be tokenized
 * oneregex  pattern source handed to set(), which compiles it
 */
RegexTokenizer::RegexTokenizer(string _input,const char* oneregex){
    // The pattern comes from the caller, not from a predefined mode. Setting
    // the member here keeps the copy constructor from reading an
    // uninitialised `mode`.
    mode= Custom;

    // set() expects a list, so wrap the single pattern.
    list<const char*>alist;
    alist.push_back(oneregex);
    set(_input,alist);
}
|
||
|
||
/*
 * Construct a tokenizer from a list of caller-supplied patterns.
 *
 * _input  text to be tokenized
 * _regex  pattern sources handed to set(), which compiles them in order
 */
RegexTokenizer::RegexTokenizer(string _input,list<const char*> _regex){
    // The patterns come from the caller, not from a predefined mode. Setting
    // the member here keeps the copy constructor from reading an
    // uninitialised `mode`.
    mode= Custom;
    set(_input,_regex);
}
|
||
|
||
/*
 * Copy constructor.
 *
 * The patterns are recompiled from their source text via set(), so this
 * object gets its own regex_t instances to free in its destructor. All
 * match state is then copied from `r`.
 */
RegexTokenizer::RegexTokenizer(const RegexTokenizer &r){
    //cerr<<"(copy constructor)"<<endl;
    set(r.input,r.regex_src);

    // Arrays cannot be assigned as a whole, so copy element by element.
    // result[] holds string objects: these must be copied with operator=,
    // never with memcpy(), which would make both objects share (and later
    // both release) the same internal buffer.
    for(int i=0; i<N_pm; ++i){
        result[i]= r.result[i];
        pm[i]= r.pm[i];
    }

    whichregexwasmatched= r.whichregexwasmatched;

    I_pm= r.I_pm;
    error= r.error;     // overrides whatever set() left behind
    so= r.so;
    eo= r.eo;
    previous_eo= r.previous_eo;
    mode= r.mode;
}
|
||
|
||
|
||
/*
 * Return a copy of this tokenizer rewound to the start of the input:
 * no error, offsets at 0 and previous_eo at -1.
 */
RegexTokenizer RegexTokenizer::begin() const
{
    //cerr<<"(begin)"<<endl;
    RegexTokenizer start(*this);

    start.previous_eo= -1;
    start.eo= 0;
    start.so= 0;
    start.error= 0;

    return start;
}
|
||
|
||
/*
 * Return a copy of this tokenizer placed past the last token: error is set
 * to 1 and so, eo and previous_eo all sit at the length of the input.
 */
RegexTokenizer RegexTokenizer::end() const
{
    //cerr<<"(end)"<<endl;
    const string::size_type length= input.length();

    RegexTokenizer finish(*this);
    finish.error= 1;
    finish.so= length;
    finish.eo= length;
    finish.previous_eo= finish.eo;

    return finish;
}
|
||
|
||
void RegexTokenizer::advance(){
|
||
//try all patterns until one matches
|
||
|
||
//cerr<<"advance"<<endl;
|
||
//wonder where to get the string from ?
|
||
//using a char * buffer is ugly, but there is no regex for string
|
||
// (no regex stuff which I'm aware of at the time of writing (1999) )
|
||
if(eo < (signed int)input.size()){
|
||
// there is no c_substr(eo,N_substring) ;-(
|
||
string sWorkspace(input,eo,N_substring);
|
||
// waste of time, but I<>m not sure when sWorkspace.c_str() gets freed;
|
||
strncpy(workspace, sWorkspace.c_str(), N_substring) ;
|
||
}
|
||
else
|
||
workspace[0]='\0';
|
||
|
||
result[0]= string();
|
||
|
||
if(
|
||
error == 0 && /* regex ok ? */
|
||
*workspace != 0 && /* check end buffer */
|
||
previous_eo < eo /* make sure we finish */
|
||
)
|
||
{/* while matches found */
|
||
//cerr<<"go over regex's supplied"<<endl;
|
||
list<regex_t>::iterator first= regex.begin();
|
||
list<regex_t>::iterator last = regex.end();
|
||
error= 1;
|
||
|
||
previous_eo= eo;
|
||
while(error && result[0].empty() && first!=last){//check for empty buffer
|
||
{
|
||
//cerr<<endl <<"matching "<< workspace + eo<< endl;
|
||
|
||
/* substring found between pm.rm_so and pm.rm_eo */
|
||
/* This call to regexec() finds the next match */
|
||
error = regexec(&*first, workspace, N_pm, &pm[0], 0);
|
||
++first;
|
||
}
|
||
|
||
if(!error){
|
||
int final_so= eo;
|
||
int final_eo= eo;
|
||
//Go over the members of pm to see submatches
|
||
int i;
|
||
i=N_pm; while(--i>0){ result[i]= string(); }
|
||
i=0;
|
||
while(i<N_pm &&
|
||
pm[i].rm_so>=0 && pm[i].rm_eo>0 &&
|
||
pm[i].rm_so<N_substring && pm[i].rm_eo<=N_substring
|
||
){
|
||
int local_so= previous_eo+pm[i].rm_so;
|
||
int local_eo= previous_eo+pm[i].rm_eo;
|
||
if(i==0)
|
||
{
|
||
final_so= local_so;
|
||
final_eo= local_eo;
|
||
}
|
||
result[i]= input.substr(local_so, local_eo-local_so);
|
||
//cout <<"match["<<i<<"]{"<<pm[i].rm_so<<","<<pm[i].rm_eo<<"}";
|
||
//cout <<"("<< local_so <<","<< local_eo <<"): " << result[i] << endl;
|
||
|
||
i++;
|
||
}
|
||
so= final_so;
|
||
eo= final_eo;
|
||
I_pm= i;
|
||
}
|
||
else{
|
||
(void)regerror(error, &*first, workspace, N_substring);
|
||
}
|
||
}
|
||
}else{
|
||
//if the final match has been passed,
|
||
//signal end (to make != operator work ?PS)
|
||
// like in *this= end();
|
||
so= input.length();
|
||
eo= input.length();
|
||
previous_eo= eo;
|
||
|
||
}
|
||
|
||
}
|
||
|
||
/*
 * Destructor: hands every compiled pattern held in `regex` to regfree().
 */
RegexTokenizer::~RegexTokenizer(){
    for(list<regex_t>::iterator compiled= regex.begin();
        compiled!=regex.end();
        ++compiled){
        //cerr<<"freeing "<<&*compiled<<endl;
        regfree(&*compiled);
    }
}
|
||
|
||
/*
 * Debug output of a tokenizer on one line:
 *   (address previous_eo-so/eo ?error)[input]
 * followed by endl.
 */
ostream& operator<<(ostream &o, const RegexTokenizer &r){
    o<<"("<<&r<<" ";
    o<<r.previous_eo<<"-"<<r.so<<"/"<<r.eo;
    o<<" ?"<<r.error<<")";
    o<<"["<<r.input<<"]"<<endl;
    return o;
}
|
||
|