208 lines
6.5 KiB
C++
208 lines
6.5 KiB
C++
/**
|
|
*
|
|
* Copyright (c) 2009-2010, Kitty Barnett
|
|
*
|
|
* The source code in this file is provided to you under the terms of the
|
|
* GNU General Public License, version 2.0, but WITHOUT ANY WARRANTY;
|
|
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
|
* PARTICULAR PURPOSE. Terms of the GPL can be found in doc/GPL-license.txt
|
|
* in this distribution, or online at http://www.gnu.org/licenses/gpl-2.0.txt
|
|
*
|
|
* By copying, modifying or distributing this software, you acknowledge that
|
|
* you have read and understood your obligations described above, and agree to
|
|
* abide by those obligations.
|
|
*
|
|
*/
|
|
|
|
#ifndef RLV_MULTISTRINGSEARCH_H
|
|
#define RLV_MULTISTRINGSEARCH_H
|
|
|
|
// ============================================================================
|
|
// Template classes for our state machine (2 dimensional array of type T)
|
|
|
|
// STL vector
|
|
template<typename T> class RlvMultiStringSearchFSM_STL
|
|
{
|
|
public:
|
|
/*
|
|
* Constructor/destructor
|
|
*/
|
|
|
|
// Initialize the FSM with an initial capacity of 'nCapacity' states
|
|
RlvMultiStringSearchFSM_STL(size_t nCapacity)
|
|
{
|
|
m_arFSM.reserve(nCapacity);
|
|
|
|
T* pT;
|
|
for (size_t idx = 0; idx < nCapacity; idx++)
|
|
{
|
|
// The width of each row is determined by the alphabet we're using (in this case UTF-8
|
|
// so while every character might consist of multiple bytes there are
|
|
// still only 256 'columns' in the state machine)
|
|
pT = new T[256]();
|
|
|
|
// The above *should* initialize to 0 but since we can't account for every compiler doing it :(
|
|
memset(pT, 0, sizeof(T) * 256);
|
|
|
|
m_arFSM.push_back(pT);
|
|
}
|
|
};
|
|
|
|
~RlvMultiStringSearchFSM_STL()
|
|
{
|
|
// Free any memory we previously allocated
|
|
for (int idx = 0, cnt = m_arFSM.size(); idx < cnt; idx++)
|
|
delete[] m_arFSM[idx];
|
|
}
|
|
|
|
/*
|
|
* Operators
|
|
*/
|
|
// ASSERTION: nState < m_arFSM.size() at all times
|
|
// In other words: do *NOT* go out of bounds on the array (no memory will have allocated for that non-existing state)
|
|
// (There probably should be a check for that even in release but it seems wasteful, just don't do it :p)
|
|
inline T* operator[](size_t nState)
|
|
{
|
|
//#ifdef _DEBUG
|
|
// assert( nState < m_arFSM.size() );
|
|
//#endif // _DEBUG
|
|
|
|
return m_arFSM[nState];
|
|
}
|
|
inline const T* operator[](size_t nState) const
|
|
{
|
|
//#ifdef _DEBUG
|
|
// assert( nState < m_arFSM.size() );
|
|
//#endif // _DEBUG
|
|
|
|
return m_arFSM[nState];
|
|
}
|
|
|
|
/*
|
|
* Public member functions
|
|
*/
|
|
|
|
size_t getSize() const { return m_arFSM.size(); }
|
|
|
|
void resize(size_t nNewCapacity)
|
|
{
|
|
// Get our current capacity (only rows > capacity need memory allocated)
|
|
size_t nCurCapacity = m_arFSM.capacity();
|
|
|
|
// Only expand, never shrink
|
|
if (nNewCapacity <= nCurCapacity)
|
|
{
|
|
//#ifdef _DEBUG
|
|
// assert(false);
|
|
//#endif //_DEBUG
|
|
|
|
return;
|
|
}
|
|
m_arFSM.resize(nNewCapacity);
|
|
|
|
// For each new state we added, allocate memory for the columns
|
|
for(size_t idx = nCurCapacity; idx < nNewCapacity; idx++)
|
|
// The memset is redundant (or rather *should* be) but since we can't account for every compiler doing it :(
|
|
m_arFSM[idx] = (T*)memset(new T[256](), 0, sizeof(T) * 256);
|
|
}
|
|
|
|
protected:
|
|
/*
|
|
* Member variables
|
|
*/
|
|
std::vector<T*> m_arFSM;
|
|
};
|
|
|
|
// ============================================================================
|
|
|
|
struct RlvMultiStringSearchMatch
|
|
{
|
|
int idxMatch; // Starting character index into the string of the matched keyword (-1 if no match)
|
|
int lenMatch; // Length of the matched keyword (undefined if no match)
|
|
U16 nParam; // User supplied parameter for the matched keyword (undefined if no match)
|
|
|
|
RlvMultiStringSearchMatch() : idxMatch(-1) {}
|
|
};
|
|
|
|
// ============================================================================
|
|
// The actual search class
|
|
|
|
class RlvMultiStringSearch
|
|
{
|
|
public:
|
|
/*
|
|
* Constructor/destructor
|
|
*/
|
|
RlvMultiStringSearch();
|
|
//~RlvMultiStringSearch();
|
|
|
|
/*
|
|
* Public member functions
|
|
*/
|
|
|
|
// Add a keyword to the state machine (if it already exists then it will simply overwrite the existing parameter)
|
|
void addKeyword(const std::string& strKeyword, U16 nParam);
|
|
|
|
BOOL getExactMatchParam(const std::string& strText, U16& nParam) const
|
|
{
|
|
RlvMultiStringSearchMatch match;
|
|
if (findFirst(strText, match))
|
|
{
|
|
// We have an exact match if the starting index is 0
|
|
// and the length of the match matches the length of the string
|
|
if ( (0 == match.idxMatch) && (match.lenMatch == (int)strText.length()) )
|
|
{
|
|
nParam = match.nParam;
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
return FALSE; // Fall-through: no (exact) match
|
|
}
|
|
|
|
// Finds the first occurance of any keyword in the supplied string
|
|
bool findFirst(const std::string& strText, RlvMultiStringSearchMatch& match) const;
|
|
// Finds the next occurance of any keyword in the supplied string
|
|
bool findNext(const std::string& strText, int idxCh, RlvMultiStringSearchMatch& match) const;
|
|
// Finds all occurances of any keyword in the supplied string
|
|
std::vector<RlvMultiStringSearchMatch> findAll(const std::string& strText);
|
|
// Finds the last occurance of any keyword in the supplied string (non-optimized)
|
|
bool findLast(const std::string& strText, RlvMultiStringSearchMatch& match) const;
|
|
|
|
protected:
|
|
// Finds the next occurance of any keyword in the supplied string
|
|
bool findNext(const char* pstrText, int idxCh, int cntCh, RlvMultiStringSearchMatch& match, bool fWordMatch = true) const;
|
|
|
|
/*
|
|
* Member variables
|
|
*/
|
|
RlvMultiStringSearchFSM_STL<U32> m_FSM; // Our finite state machine (4 bytes * 256 = 1Kb of memory/state)
|
|
// HIWORD(U32) = 16-bits of user data
|
|
// LOWORD(U32) = ABBBBBBBBBBBBBBB
|
|
// A = termination bit
|
|
// If (set) and (B == 0): match
|
|
// If (set) and (B != 0): match, but might only be a substring of another keyword
|
|
// B = next state (0..32767)
|
|
// If (B == 0): false lead -> backtrack
|
|
// If (B != 0): partial keyword match, next state
|
|
size_t m_cntState; // The number of states in the FSM (= the number of *used* rows in the array)
|
|
};
|
|
|
|
// ============================================================================
|
|
// Inlined member functions
|
|
//
|
|
|
|
inline bool RlvMultiStringSearch::findFirst(const std::string& strText, RlvMultiStringSearchMatch& match) const
|
|
{
|
|
return findNext(strText.c_str(), 0, strText.length(), match);
|
|
}
|
|
|
|
inline bool RlvMultiStringSearch::findNext(const std::string& strText, int idxCh, RlvMultiStringSearchMatch& match) const
|
|
{
|
|
return findNext(strText.c_str(), idxCh, strText.length(), match);
|
|
}
|
|
|
|
// ============================================================================
|
|
|
|
#endif // RLV_MULTISTRINGSEARCH_H
|