petidomo/libtext/RegexTokenizer.cpp
2000-12-13 15:45:25 +00:00

260 lines
6.9 KiB
C++
Raw Blame History

/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH, Germany.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "text.hpp"
// Scratch buffer, defined once for the whole class (static member).
// advance() copies up to N_substring characters of the input into it so
// that regexec() has a plain C string to work on, and regerror() writes its
// message into it as well.  The extra element leaves room for the
// terminating '\0'.
char RegexTokenizer::workspace[RegexTokenizer::N_substring+1]="";
/*
 * Create a tokenizer without input and without patterns.
 *
 * The scanning state is put into the same defined initial state that set()
 * establishes through reset(), so that advance() and the copy constructor
 * never read indeterminate members of a default-constructed object.
 * The mode is Custom because no predefined pattern has been selected.
 */
RegexTokenizer::RegexTokenizer(){
    mode= Custom;
    reset();
}
void RegexTokenizer::reset(){
input= string();
int i=N_pm; while(--i>0){ pm[i].rm_so=-1; pm[i].rm_eo=-1; }
so= 0;
eo= 0;
previous_eo= -1;
error= 0;
}
/*
 * Install a new input string and compile the given patterns.
 *
 * Calls reset(), stores _input and then compiles every entry of _regex in
 * list order.  Each compiled pattern is appended to `regex` and its source
 * text to `regex_src`.
 *
 * Returns 0 when every pattern compiled, otherwise the non-zero regcomp()
 * code of the first pattern that failed.  Patterns compiled before the
 * failing one stay installed.
 *
 * NOTE(review): patterns are only ever appended; nothing here removes the
 * patterns of an earlier call -- confirm that set() is meant to be called
 * just once per object.
 */
int RegexTokenizer::set(string _input,list<const char*> _regex){
    reset();
    input= _input;
    for(list<const char*>::iterator it= _regex.begin(); it!=_regex.end(); ++it){
        regex_t re;
        //REG_EXTENDED
        //use extended regular expressions
        //REG_NEWLINE
        //makes ^...$ work to match newline/endofline
        const int rc= regcomp(&re, *it, REG_EXTENDED|REG_NEWLINE);
        if(rc)
            return rc;
        regex.push_back(re);
        regex_src.push_back(*it);
    }
    // Success must be reported explicitly: running off the end of a
    // function that returns int is undefined behaviour.
    return 0;
}
/*
 * Build a tokenizer for _input using one of the predefined patterns.
 *
 * Word, Line and RFC each select a single built-in expression.  Any other
 * mode (Custom included) has no built-in expression: a diagnostic goes to
 * cerr and the tokenizer is set up with an empty pattern list.
 */
RegexTokenizer::RegexTokenizer(string _input,Mode _mode){
    mode= _mode;

    // Pick the built-in expression, if the mode has one.
    const char* pattern= 0;
    if(_mode==Word){
        pattern= "([^ \t\n]*)([ \t\n]*)";
    }
    else if(_mode==Line){
        pattern= "^(.*)$\n";
    }
    else if(_mode==RFC){
        //this works, but output is confusing
        // that is, how to remove the glue ?
        pattern= "((^.*$)((\n)^[ \t]+.*$)*)(\n)?";
    }
    else{
        cerr<<"RegexTokenizer mode constructor called with pointless mode."<<endl;
    }

    list<const char*> patterns;
    if(pattern!=0)
        patterns.push_back(pattern);
    set(_input,patterns);
}
/*
 * Build a tokenizer for _input that uses the single caller-supplied
 * expression `oneregex`.
 *
 * The mode is recorded as Custom; the copy constructor reads `mode`, so it
 * must not be left indeterminate.
 */
RegexTokenizer::RegexTokenizer(string _input,const char* oneregex){
    mode= Custom;
    // set() wants a list, so wrap the one expression
    list<const char*>alist;
    alist.push_back(oneregex);
    set(_input,alist);
}
/*
 * Build a tokenizer for _input that tries the caller-supplied expressions
 * in list order.
 *
 * The mode is recorded as Custom; the copy constructor reads `mode`, so it
 * must not be left indeterminate.
 */
RegexTokenizer::RegexTokenizer(string _input,list<const char*> _regex){
    mode= Custom;
    set(_input,_regex);
}
/*
 * Copy constructor.
 *
 * The patterns are recompiled from their source text through set(), so the
 * copy has its own regex_t objects to free in its destructor.  The scanning
 * state (results, match offsets, position, error, mode) is then taken over
 * from r.
 */
RegexTokenizer::RegexTokenizer(const RegexTokenizer &r){
    //cerr<<"(copy constructor)"<<endl;
    set(r.input,r.regex_src);
    // Arrays cannot be assigned as a whole ("ANSI C++ forbids ..."), so copy
    // them element by element.  result[] holds string objects, which must be
    // copied with their assignment operator: a bitwise memcpy() would leave
    // two strings sharing one internal buffer.
    for(int i=0; i<N_pm; ++i){
        result[i]= r.result[i];
        pm[i]= r.pm[i];
    }
    whichregexwasmatched= r.whichregexwasmatched;
    I_pm= r.I_pm;
    error= r.error;
    so= r.so;
    eo= r.eo;
    previous_eo= r.previous_eo;
    mode= r.mode;
}
/*
 * Return a copy of this tokenizer rewound to the start of the input:
 * position 0, no error, and previous_eo at -1.
 */
RegexTokenizer RegexTokenizer::begin() const
{
    //cerr<<"(begin)"<<endl;
    RegexTokenizer start(*this);
    start.previous_eo= -1;
    start.eo= 0;
    start.so= 0;
    start.error= 0;
    return start;
}
/*
 * Return a copy of this tokenizer in the "past the last token" state:
 * so, eo and previous_eo all at the input length, and error set to 1.
 */
RegexTokenizer RegexTokenizer::end() const
{
    //cerr<<"(end)"<<endl;
    RegexTokenizer finish(*this);
    finish.eo= input.length();
    finish.so= input.length();
    finish.previous_eo= finish.eo;
    finish.error= 1;
    return finish;
}
void RegexTokenizer::advance(){
//try all patterns until one matches
//cerr<<"advance"<<endl;
//wonder where to get the string from ?
//using a char * buffer is ugly, but there is no regex for string
// (no regex stuff which I'm aware of at the time of writing (1999) )
if(eo < (signed int)input.size()){
// there is no c_substr(eo,N_substring) ;-(
string sWorkspace(input,eo,N_substring);
// waste of time, but I<>m not sure when sWorkspace.c_str() gets freed;
strncpy(workspace, sWorkspace.c_str(), N_substring) ;
}
else
workspace[0]='\0';
result[0]= string();
if(
error == 0 && /* regex ok ? */
*workspace != 0 && /* check end buffer */
previous_eo < eo /* make sure we finish */
)
{/* while matches found */
//cerr<<"go over regex's supplied"<<endl;
list<regex_t>::iterator first= regex.begin();
list<regex_t>::iterator last = regex.end();
error= 1;
previous_eo= eo;
while(error && result[0].empty() && first!=last){//check for empty buffer
{
//cerr<<endl <<"matching "<< workspace + eo<< endl;
/* substring found between pm.rm_so and pm.rm_eo */
/* This call to regexec() finds the next match */
error = regexec(&*first, workspace, N_pm, &pm[0], 0);
++first;
}
if(!error){
int final_so= eo;
int final_eo= eo;
//Go over the members of pm to see submatches
int i;
i=N_pm; while(--i>0){ result[i]= string(); }
i=0;
while(i<N_pm &&
pm[i].rm_so>=0 && pm[i].rm_eo>0 &&
pm[i].rm_so<N_substring && pm[i].rm_eo<=N_substring
){
int local_so= previous_eo+pm[i].rm_so;
int local_eo= previous_eo+pm[i].rm_eo;
if(i==0)
{
final_so= local_so;
final_eo= local_eo;
}
result[i]= input.substr(local_so, local_eo-local_so);
//cout <<"match["<<i<<"]{"<<pm[i].rm_so<<","<<pm[i].rm_eo<<"}";
//cout <<"("<< local_so <<","<< local_eo <<"): " << result[i] << endl;
i++;
}
so= final_so;
eo= final_eo;
I_pm= i;
}
else{
(void)regerror(error, &*first, workspace, N_substring);
}
}
}else{
//if the final match has been passed,
//signal end (to make != operator work ?PS)
// like in *this= end();
so= input.length();
eo= input.length();
previous_eo= eo;
}
}
/*
 * Destructor: hands every compiled pattern in `regex` to regfree().
 */
RegexTokenizer::~RegexTokenizer(){
    for(list<regex_t>::iterator compiled= regex.begin();
        compiled!=regex.end();
        ++compiled){
        //cerr<<"freeing "<<&*compiled<<endl;
        regfree(&*compiled);
    }
}
/*
 * Debug output of a tokenizer on one line, followed by endl:
 *   (address previous_eo-so/eo ?error)[input]
 */
ostream& operator<<(ostream &o, const RegexTokenizer &r){
    o<<"("<<&r<<" ";
    o<<r.previous_eo<<"-"<<r.so<<"/"<<r.eo;
    o<<" ?"<<r.error<<")";
    o<<"["<<r.input<<"]"<<endl;
    return o;
}