260 lines
6.9 KiB
C++
260 lines
6.9 KiB
C++
|
|
/*
 * $Source$
 * $Revision$
 * $Date$
 *
 * Copyright (c) 1999 by CyberSolutions GmbH, Germany.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by CyberSolutions GmbH.
 *
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
|
|||
|
|
|
|||
|
|
|
|||
|
|
#include "text.hpp"
|
|||
|
|
|
|||
|
|
// Definition of the static scratch buffer shared by all RegexTokenizer
// objects. advance() copies up to N_substring characters of the unconsumed
// input into it so that the C regexec() API can be handed a char*.
// The array is one element larger than N_substring; presumably the extra
// slot is reserved for the terminating NUL.
char RegexTokenizer::workspace[RegexTokenizer::N_substring+1]="";
|
|||
|
|
|
|||
|
|
/*
 * Default constructor: builds a tokenizer with no input and no patterns.
 *
 * The members are brought into the same defined state that set() starts
 * from; previously they were left indeterminate, although the copy
 * constructor (used by begin() and end()) reads all of them.
 */
RegexTokenizer::RegexTokenizer(){
  // No predefined pattern is installed, so record the mode as Custom.
  // NOTE(review): Custom is assumed to be the right tag for "no built-in
  // pattern" -- confirm against the Mode declaration in text.hpp.
  mode= Custom;
  reset();
}
|
|||
|
|
|
|||
|
|
void RegexTokenizer::reset(){
|
|||
|
|
input= string();
|
|||
|
|
int i=N_pm; while(--i>0){ pm[i].rm_so=-1; pm[i].rm_eo=-1; }
|
|||
|
|
so= 0;
|
|||
|
|
eo= 0;
|
|||
|
|
previous_eo= -1;
|
|||
|
|
error= 0;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/*
 * Install a new input string and compile the given patterns.
 *
 * _input  text to be tokenized
 * _regex  POSIX extended regular expressions, tried in list order by
 *         advance(); the pointers themselves are stored in regex_src,
 *         so the pattern strings must outlive this object
 *
 * Returns 0 on success. If a pattern fails to compile, the regcomp()
 * error code is returned immediately; patterns compiled before the
 * failing one stay installed.
 */
int RegexTokenizer::set(string _input,list<const char*> _regex){
  reset();
  input= _input;

  list<const char*>::iterator first= _regex.begin();
  list<const char*>::iterator last = _regex.end();

  while(first!=last){
    regex_t re;

    //REG_EXTENDED
    //use extended regular expressions
    //REG_NEWLINE
    //makes ^...$ work to match newline/endofline
    const int rc= regcomp(&re, *first, REG_EXTENDED|REG_NEWLINE);
    if(rc)
      return rc;

    regex.push_back(re);
    regex_src.push_back(*first);
    ++first;
  }

  // Previously control fell off the end of this int function on success,
  // which is undefined behaviour; report success explicitly.
  return 0;
}
|
|||
|
|
|
|||
|
|
/*
 * Construct a tokenizer that uses one of the predefined patterns.
 *
 * _input  text to be tokenized
 * _mode   selects the built-in pattern:
 *           Word - a run of non-blank characters plus the blanks after it
 *           Line - one line including its terminating newline
 *           RFC  - one line plus any following lines that start with
 *                  a blank or tab, and an optional trailing newline
 *         Any other value (including Custom) has no built-in pattern;
 *         a diagnostic is written to cerr and no pattern is installed.
 */
RegexTokenizer::RegexTokenizer(string _input,Mode _mode){
  mode= _mode;

  // Pick the pattern first; a null pointer means "nothing to install".
  const char* builtin= 0;
  switch(_mode){
    case Word:
      builtin= "([^ \t\n]*)([ \t\n]*)";
      break;
    case Line:
      builtin= "^(.*)$\n";
      break;
    case RFC:
      //this works, but output is confusing
      // that is, how to remove the glue ?
      builtin= "((^.*$)((\n)^[ \t]+.*$)*)(\n)?";
      break;
    case Custom:
      // deliberately shares the diagnostic below
    default:
      cerr<<"RegexTokenizer mode constructor called with pointless mode."<<endl;
  }

  list<const char*> patterns;
  if(builtin)
    patterns.push_back(builtin);

  set(_input,patterns);
}
|
|||
|
|
|
|||
|
|
/*
 * Construct a tokenizer from an input string and a single
 * caller-supplied pattern.
 *
 * _input    text to be tokenized
 * oneregex  POSIX extended regular expression; only the pointer is kept,
 *           so the string must outlive this object
 *
 * The result of set() is not checked here; a pattern that fails to
 * compile simply leaves the tokenizer without that pattern.
 */
RegexTokenizer::RegexTokenizer(string _input,const char* oneregex){
  // The pattern comes from the caller, so tag the object as Custom.
  // Previously `mode` stayed uninitialised and was later read by the
  // copy constructor.
  // NOTE(review): confirm Custom is the intended tag for this case.
  mode= Custom;

  list<const char*> patterns;
  patterns.push_back(oneregex);
  set(_input,patterns);
}
|
|||
|
|
|
|||
|
|
/*
 * Construct a tokenizer from an input string and a list of
 * caller-supplied patterns, which advance() tries in list order.
 *
 * _input  text to be tokenized
 * _regex  POSIX extended regular expressions; only the pointers are
 *         kept, so the strings must outlive this object
 *
 * The result of set() is not checked here.
 */
RegexTokenizer::RegexTokenizer(string _input,list<const char*> _regex){
  // The patterns come from the caller, so tag the object as Custom.
  // Previously `mode` stayed uninitialised and was later read by the
  // copy constructor.
  // NOTE(review): confirm Custom is the intended tag for this case.
  mode= Custom;
  set(_input,_regex);
}
|
|||
|
|
|
|||
|
|
/*
 * Copy constructor (used by begin() and end()).
 *
 * The patterns are recompiled from their source strings instead of
 * copying the regex_t objects, so that each tokenizer owns the compiled
 * patterns it frees in its destructor. The scan state (results, match
 * offsets, positions, error flag, mode) is then copied from r.
 */
RegexTokenizer::RegexTokenizer(const RegexTokenizer &r){
  //cerr<<"(copy constructor)"<<endl;
  set(r.input,r.regex_src);

  // Arrays cannot be assigned as a whole ("ANSI C++ forbids ..."), so copy
  // element by element. The strings in result[] must be copied with
  // operator=; a raw memcpy of std::string objects is undefined behaviour
  // and makes both objects refer to the same character buffer.
  for(int i=0; i<N_pm; ++i){
    result[i]= r.result[i];
    pm[i]= r.pm[i];
  }

  whichregexwasmatched= r.whichregexwasmatched;

  I_pm= r.I_pm;
  error= r.error;
  so= r.so;
  eo= r.eo;
  previous_eo= r.previous_eo;
  mode= r.mode;
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
/*
 * Return a copy of this tokenizer rewound to the start of the input:
 * error flag cleared, offsets at 0 and previous_eo at -1.
 */
RegexTokenizer RegexTokenizer::begin() const
{
  //cerr<<"(begin)"<<endl;
  RegexTokenizer start(*this);

  // Same values reset() establishes for a fresh scan.
  start.previous_eo= -1;
  start.eo= 0;
  start.so= 0;
  start.error= 0;

  return start;
}
|
|||
|
|
|
|||
|
|
/*
 * Return a copy of this tokenizer in the "past the last token" state:
 * error flag set and all offsets equal to the input length. This is the
 * same position advance() moves to once it has nothing left to scan.
 */
RegexTokenizer RegexTokenizer::end() const
{
  //cerr<<"(end)"<<endl;
  RegexTokenizer finish(*this);

  finish.error= 1;

  const string::size_type total= input.length();
  finish.so= total;
  finish.eo= total;
  finish.previous_eo= finish.eo;

  return finish;
}
|
|||
|
|
|
|||
|
|
void RegexTokenizer::advance(){
|
|||
|
|
//try all patterns until one matches
|
|||
|
|
|
|||
|
|
//cerr<<"advance"<<endl;
|
|||
|
|
//wonder where to get the string from ?
|
|||
|
|
//using a char * buffer is ugly, but there is no regex for string
|
|||
|
|
// (no regex stuff which I'm aware of at the time of writing (1999) )
|
|||
|
|
if(eo < (signed int)input.size()){
|
|||
|
|
// there is no c_substr(eo,N_substring) ;-(
|
|||
|
|
string sWorkspace(input,eo,N_substring);
|
|||
|
|
// waste of time, but I<>m not sure when sWorkspace.c_str() gets freed;
|
|||
|
|
strncpy(workspace, sWorkspace.c_str(), N_substring) ;
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
workspace[0]='\0';
|
|||
|
|
|
|||
|
|
result[0]= string();
|
|||
|
|
|
|||
|
|
if(
|
|||
|
|
error == 0 && /* regex ok ? */
|
|||
|
|
*workspace != 0 && /* check end buffer */
|
|||
|
|
previous_eo < eo /* make sure we finish */
|
|||
|
|
)
|
|||
|
|
{/* while matches found */
|
|||
|
|
//cerr<<"go over regex's supplied"<<endl;
|
|||
|
|
list<regex_t>::iterator first= regex.begin();
|
|||
|
|
list<regex_t>::iterator last = regex.end();
|
|||
|
|
error= 1;
|
|||
|
|
|
|||
|
|
previous_eo= eo;
|
|||
|
|
while(error && result[0].empty() && first!=last){//check for empty buffer
|
|||
|
|
{
|
|||
|
|
//cerr<<endl <<"matching "<< workspace + eo<< endl;
|
|||
|
|
|
|||
|
|
/* substring found between pm.rm_so and pm.rm_eo */
|
|||
|
|
/* This call to regexec() finds the next match */
|
|||
|
|
error = regexec(&*first, workspace, N_pm, &pm[0], 0);
|
|||
|
|
++first;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if(!error){
|
|||
|
|
int final_so= eo;
|
|||
|
|
int final_eo= eo;
|
|||
|
|
//Go over the members of pm to see submatches
|
|||
|
|
int i;
|
|||
|
|
i=N_pm; while(--i>0){ result[i]= string(); }
|
|||
|
|
i=0;
|
|||
|
|
while(i<N_pm &&
|
|||
|
|
pm[i].rm_so>=0 && pm[i].rm_eo>0 &&
|
|||
|
|
pm[i].rm_so<N_substring && pm[i].rm_eo<=N_substring
|
|||
|
|
){
|
|||
|
|
int local_so= previous_eo+pm[i].rm_so;
|
|||
|
|
int local_eo= previous_eo+pm[i].rm_eo;
|
|||
|
|
if(i==0)
|
|||
|
|
{
|
|||
|
|
final_so= local_so;
|
|||
|
|
final_eo= local_eo;
|
|||
|
|
}
|
|||
|
|
result[i]= input.substr(local_so, local_eo-local_so);
|
|||
|
|
//cout <<"match["<<i<<"]{"<<pm[i].rm_so<<","<<pm[i].rm_eo<<"}";
|
|||
|
|
//cout <<"("<< local_so <<","<< local_eo <<"): " << result[i] << endl;
|
|||
|
|
|
|||
|
|
i++;
|
|||
|
|
}
|
|||
|
|
so= final_so;
|
|||
|
|
eo= final_eo;
|
|||
|
|
I_pm= i;
|
|||
|
|
}
|
|||
|
|
else{
|
|||
|
|
(void)regerror(error, &*first, workspace, N_substring);
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}else{
|
|||
|
|
//if the final match has been passed,
|
|||
|
|
//signal end (to make != operator work ?PS)
|
|||
|
|
// like in *this= end();
|
|||
|
|
so= input.length();
|
|||
|
|
eo= input.length();
|
|||
|
|
previous_eo= eo;
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/*
 * Destructor: hands every compiled pattern in `regex` back to regfree().
 */
RegexTokenizer::~RegexTokenizer(){
  for(list<regex_t>::iterator compiled= regex.begin();
      compiled!=regex.end();
      ++compiled){
    //cerr<<"freeing "<<&*compiled<<endl;
    regfree(&*compiled);
  }
}
|
|||
|
|
|
|||
|
|
/*
 * Debug dump of a tokenizer on one line:
 *   (<address> <previous_eo>-<so>/<eo> ?<error>)[<input>]
 * followed by endl. Returns the stream to allow chaining.
 */
ostream& operator<<(ostream &o, const RegexTokenizer &r){
  // identity and current offsets
  o<<"("<<&r<<" "<<r.previous_eo<<"-"<<r.so<<"/"<<r.eo;
  // error flag
  o<<" ?"<<r.error<<")";
  // the complete input string
  o<<"["<<r.input<<"]"<<endl;
  return o;
}
|
|||
|
|
|