213 lines
7.4 KiB
C++
213 lines
7.4 KiB
C++
/**
|
|
*
|
|
* Copyright (c) 2009-2010, Kitty Barnett
|
|
*
|
|
* The source code in this file is provided to you under the terms of the
|
|
* GNU General Public License, version 2.0, but WITHOUT ANY WARRANTY;
|
|
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
|
* PARTICULAR PURPOSE. Terms of the GPL can be found in doc/GPL-license.txt
|
|
* in this distribution, or online at http://www.gnu.org/licenses/gpl-2.0.txt
|
|
*
|
|
* By copying, modifying or distributing this software, you acknowledge that
|
|
* you have read and understood your obligations described above, and agree to
|
|
* abide by those obligations.
|
|
*
|
|
*/
|
|
|
|
#include "llviewerprecompiledheaders.h"
|
|
|
|
#include "rlvmultistringsearch.h"
|
|
|
|
// ====================================================================================
|
|
|
|
#ifndef RLV_LOWORD
|
|
#define RLV_LOWORD(x) ( (U16)( ((U32)x) & 0xFFFF) )
|
|
#endif // RLV_LOWORD
|
|
|
|
#ifndef RLV_HIWORD
|
|
#define RLV_HIWORD(x) ( (U16)( (((U32)x) >> 16) & 0xFFFF) )
|
|
#endif // RLV_HIWORD
|
|
|
|
// ====================================================================================
|
|
|
|
// (TODO-RLV: oops, forgot I was experimenting with word matching, get rid of that again?)
|
|
#define isLetter(ch) \
|
|
( ( (ch >= 'a') && (ch <= 'z') ) || ( (ch >= 'A') && (ch <= 'Z') ) )
|
|
|
|
RlvMultiStringSearch::RlvMultiStringSearch()
|
|
: m_FSM(256), // Start our FSM with room for 256 states (enough for all attachment point names)
|
|
m_cntState(0)
|
|
{
|
|
}
|
|
|
|
void RlvMultiStringSearch::addKeyword(const std::string& strKeyword, U16 nParam) {
|
|
U16 nCurState = 0; // Always start the loop at state 0
|
|
|
|
//
|
|
// Make sure there are enough unused rows to accomodate the worst case (== strKeyword.length() new states)
|
|
//
|
|
size_t nMaxState = m_FSM.getSize();
|
|
if (m_cntState + strKeyword.length() > nMaxState)
|
|
// Allocate enough new rows (in batches of 256 rows)
|
|
m_FSM.resize(nMaxState + ((strKeyword.length() / 256) + 1) * 256);
|
|
|
|
//
|
|
// Walk the string character by character
|
|
//
|
|
for (int idxCh = 0, cntCh = strKeyword.length(); idxCh < cntCh; idxCh++) {
|
|
// Look up the next state for current character
|
|
unsigned char ch = strKeyword[idxCh];
|
|
U16 nState = RLV_LOWORD(m_FSM[nCurState][ch]);
|
|
|
|
// If we're at the last character in the keyword then set the termination bit
|
|
if (cntCh - 1 == idxCh)
|
|
{
|
|
// (Only set the termination bit for the state because this keyword might be a substring of another keyword)
|
|
m_FSM[nCurState][ch] = (nParam << 16) | (nState | 0x8000);
|
|
}
|
|
else if ( (nState & 0x7FFF) == 0 ) // If the new state is 0 then we're creating a new path
|
|
{
|
|
// (Preserve the termination bit because another keyword might be a substring of this one)
|
|
nState = ++m_cntState | (nState & 0x8000);
|
|
|
|
// Store the new path in the FSM
|
|
//m_FSM[nCurState][ch] = (nParam << 16) | nState;
|
|
m_FSM[nCurState][ch] |= nState;
|
|
}
|
|
|
|
nCurState = nState & 0x7FFF; // Mask out the termination bit since we never need it for the current state
|
|
}
|
|
}
|
|
|
|
// (Iterating over a "const char*" is *significantly* faster than "std::string")
|
|
bool RlvMultiStringSearch::findNext(const char* pstrText, int idxCh, int cntCh, RlvMultiStringSearchMatch& match, bool fWordMatch) const
|
|
{
|
|
U16 nCurState = 0; // Always start the loop at state 0
|
|
U32 nLastMatch = 0; // Holds the state of the last (possibly partial) keyword match
|
|
|
|
//
|
|
// Walk the string character by character
|
|
//
|
|
for (; idxCh < cntCh; idxCh++)
|
|
{
|
|
// Keep track of the current state in case we need to backtrack
|
|
U16 nPrevState = nCurState;
|
|
|
|
// If we're currently in state 0, save the current character index (for backtracking or as keyword index match)
|
|
if (nCurState == 0)
|
|
match.idxMatch = idxCh;
|
|
|
|
// Look up the current character in the FSM
|
|
unsigned char ch = (unsigned char)pstrText[idxCh];
|
|
U32 nCell = m_FSM[nCurState & 0x7FFF][ch];
|
|
|
|
// If the termination bit is set then we found a keyword substring match
|
|
// If the next state is non-zero then we can't stop yet because the matched keyword might be a substring of another keyword
|
|
if (nCell & 0x8000)
|
|
{
|
|
if ( 0 == (nCell & 0x7FFF) )
|
|
{
|
|
// Termination bit with 'next state' equal to 0: matched keyword which isn't also a substring of any other keyword
|
|
match.lenMatch = idxCh - match.idxMatch + 1;
|
|
match.nParam = RLV_HIWORD(nCell);
|
|
|
|
// Rudimentary word matching: check if the match is a 'word'
|
|
if
|
|
(
|
|
(!fWordMatch) ||
|
|
(
|
|
( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter
|
|
( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) )
|
|
)
|
|
)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
// Not a word, but there's no need to backtrack: we can move on from the character after the current one
|
|
nCell = 0; // Will set nCurState == 0 further down
|
|
match.idxMatch = idxCh; // Makes sure we move on to the next character instead of backtracking
|
|
}
|
|
else
|
|
{
|
|
nLastMatch = nCell;
|
|
|
|
// In case it turns out that we need to backtrack and return this match, save the length of this match
|
|
match.lenMatch = idxCh - match.idxMatch + 1;
|
|
}
|
|
}
|
|
|
|
nCurState = RLV_LOWORD(nCell);
|
|
|
|
// If our new state is 0, but our previous state wasn't, then we followed a false lead and need to backtrack
|
|
if ( (nPrevState != 0) && (nCurState == 0) )
|
|
{
|
|
// * if nLastMatch == 0 then we need to backtrack and keep going
|
|
// * if nLastMatch != 0 then we previously encountered a keyword match so return that one
|
|
if (nLastMatch) {
|
|
// Rudimentary word matching: check if the match is a 'word'
|
|
if
|
|
(
|
|
(!fWordMatch) ||
|
|
(
|
|
( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) && // Start of string OR non-letter
|
|
( (!isLetter(pstrText[match.idxMatch + match.lenMatch])) )
|
|
)
|
|
)
|
|
{
|
|
match.nParam = RLV_HIWORD(nLastMatch);
|
|
return true;
|
|
} else
|
|
// Not a word match, so throw away this partial match and backtrack
|
|
nLastMatch = 0;
|
|
}
|
|
|
|
idxCh = match.idxMatch;
|
|
}
|
|
}
|
|
|
|
// We encountered a match, but while investigating whether it was a substring of another keyword we ran out of characters
|
|
if (nLastMatch)
|
|
{
|
|
// Rudimentary word matching: check if we started at the beginning of a word (we know the one behind us is '\0')
|
|
if ( (!fWordMatch) || ( (0 == match.idxMatch) || (!isLetter(pstrText[match.idxMatch - 1])) ) )
|
|
{
|
|
match.nParam = RLV_HIWORD(nLastMatch);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// None of the keywords is contained in the string: return failure
|
|
match.idxMatch = -1;
|
|
return false;
|
|
}
|
|
|
|
bool RlvMultiStringSearch::findLast(const std::string& strText, RlvMultiStringSearchMatch& match) const {
|
|
RlvMultiStringSearchMatch matchTemp;
|
|
match.idxMatch = -1; // (Needed to make the return work in case we don't find anything)
|
|
matchTemp.lenMatch = 0; // (Needed to make the first loop iteration start at 0)
|
|
|
|
// Iterating over a "const char*" is *significantly* faster than "std::string"
|
|
const char* pstrText = strText.c_str();
|
|
int lenText = strText.length();
|
|
|
|
while (findNext(pstrText, matchTemp.idxMatch + matchTemp.lenMatch + 1, lenText, matchTemp))
|
|
match = matchTemp;
|
|
|
|
return (match.idxMatch != -1);
|
|
}
|
|
|
|
std::vector<RlvMultiStringSearchMatch> RlvMultiStringSearch::findAll(const std::string& strText) {
|
|
std::vector<RlvMultiStringSearchMatch> arMatch;
|
|
|
|
RlvMultiStringSearchMatch match;
|
|
match.lenMatch = 0; // (Needed to make the first loop iteration start at 0)
|
|
|
|
while (findNext(strText, match.idxMatch + match.lenMatch + 1, match))
|
|
arMatch.push_back(match);
|
|
|
|
return arMatch;
|
|
}
|
|
|
|
// ====================================================================================
|