petidomo/libtext/RegexTokenizer_test.cpp
2000-12-13 15:45:25 +00:00

235 lines
6.4 KiB
C++

/*
* $Source$
* $Revision$
* $Date$
*
* Copyright (c) 1999 by CyberSolutions GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by CyberSolutions GmbH.
*
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "text.hpp"
// begin test section
// Set verbose to 1 if you want to see the parsed tokens
const int verbose=0;
int test1(){
string input("a22bbb4444ccccc999999999dfgDFG");
// Set up the tokenizer to match the input string
// against a regular expression.
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,"([a-z]*)([^a-z]*)");
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin 1*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout << *next // Entire match,
<<"="
<<next[1] // first subexpression,
<<"+"
<<next[2] // 2nd subexpr.
<< endl;
}
count++;
}
if(verbose)
cout << "--- end 1 ---"<<count<<endl;
if(count != 5)
return 1;
return 0;
}
int test2(){
string input("Word-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
// Set up the tokenizer to match the input string
// against a regular expression.
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,"([^ \t\n]*)([ \t\n]*)");
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin 2*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<next[1] // first matched subexpression
<<"["
<<next[2] // second matched subexpression
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end 2 ---"<<count<<endl;
if(count != 8)
return 1;
return 0;
}
int testWord(){
string input("Ein Satz aus vielen langen Wor-ten.\nUnd ein zweiter Satz.2\n3");
// Set up the tokenizer to match the input string
// against a regular expression that defines the word-wise tokenizing.
// The expression used is "([^ \t\n]*)([ \t\n]*)" .
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::Word);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin Word*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<next[1] // first matched subexpression,
<<"["
<<next[2] // 2nd matched subexpr.
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end Word ---"<<count<<endl;
if(count != 12)
return 1;
return 0;
}
int testLine(){
string input("Line-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a.");
// Set up the tokenizer to match the input string
// against a regular expression that defines line by line tokenizing.
// The expression used is "^(.*)$\n" .
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::Line);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin Line*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<"'"
<<next[1] // first matched subexpression
<<"'"<<"["
<<next[2] // second matched subexpression
<<"]"<< endl;
}
count++;
}
if(verbose)
cout << "--- end Line ---"<<count<<endl;
if(count != 5)
return 1;
return 0;
}
int testRFC(){
string input("RFC-Satz 1\n Satz 1a\nSatz 2\n\tSatz2a\n\tSatz2b.");
// Set up the tokenizer to match the input string
// against a regular expression that defines RFC-style tokenizing.
// The expression used is "((^.*$)((\n)^[ \t]+.*$)*)(\n)?" .
// Bug: whitespace that glues one line to the next is not removed.
// (afaik, there not way to do this with a single regular expression).
// The entire match will be returned by *rt,
// subexpressions by rt[1], rt[2], rt[3], ..
RegexTokenizer rt(input,RegexTokenizer::RFC);
int count=0;
RegexTokenizer::iterator next= rt.begin();
const RegexTokenizer::iterator last= rt.end();
if(verbose)
cout << "*** begin RFC*** "<<endl;
while(next!=last){
++next;// Preinc - processes input
if(next!=last){
if(verbose)
cout <<"'"<<next[1]<<"'"<<"["<<next[2]<<"]"<<"["<<next[3]<<"]"<< endl;
// first, second and third matched subexpression
}
count++;
}
if(verbose)
cout << "--- end RFC ---"<<count<<endl;
if(count != 3)
return 1;
return 0;
}
int main(int argc, char *argv[])
{
return test1() || test2() || testWord() || testLine() || testRFC() ;
}