"""\
@file named_query.py
@author Ryan Williams, Phoenix
@date 2007-07-31
@brief An API for running named queries.

$LicenseInfo:firstyear=2007&license=mit$

Copyright (c) 2007-2009, Linden Research, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
$/LicenseInfo$
"""

import errno
import MySQLdb
import MySQLdb.cursors
import os
import os.path
import re
import sys
import time

from indra.base import llsd
from indra.base import config

DEBUG = False
NQ_FILE_SUFFIX = config.get('named-query-file-suffix', '.nq')
NQ_FILE_SUFFIX_LEN = len(NQ_FILE_SUFFIX)

# Patterns used to translate the named query syntax into pyformat.  They
# are compiled once here instead of on every conversion.
#
# A literal % that must reach mysql untouched (not part of a LIKE marker).
_LITERAL_PERCENT_RE = re.compile(r"(?<=[^a-zA-Z0-9_-])%(?=[^:])")
# %:name% or :name% -- LIKE-style substitution.
_LIKE_RE = re.compile(r"(%?):([a-zA-Z][a-zA-Z0-9_-]*)%")
# #:name -- integer substitution.
_INTEGER_RE = re.compile(r"#:([a-zA-Z][a-zA-Z0-9_-]*)")
# :name -- plain substitution.
_PLAIN_RE = re.compile(r":([a-zA-Z][a-zA-Z0-9_-]*)")
# @:name after plain conversion, i.e. @%(name)s -- array expansion.
_ARRAY_RE = re.compile(r"@%\(([a-zA-Z][a-zA-Z0-9_-]*)\)s")

# Characters that act as wildcards inside a LIKE expression.
_LIKE_WILDCARDS = frozenset(u"%_")

_g_named_manager = None


def _init_g_named_manager(sql_dir = None):
    """Initializes a global NamedManager object to point at a
    specified named queries hierarchy.

    This function is intended entirely for testing purposes,
    because it's tricky to control the config from inside a test.

    @param sql_dir Directory holding the named query documents.  When
    None, the 'named-query-base-dir' config value is used, and if that is
    also None a path relative to this file is used.
    """
    if sql_dir is None:
        sql_dir = config.get('named-query-base-dir')

    # extra fallback directory in case config doesn't return what we want
    if sql_dir is None:
        sql_dir = os.path.abspath(
            os.path.join(
                os.path.realpath(os.path.dirname(__file__)),
                "..", "..", "..", "..", "web", "dataservice", "sql"))

    global _g_named_manager
    _g_named_manager = NamedQueryManager(
        os.path.abspath(os.path.realpath(sql_dir)))


def get(name):
    "Get the named query object to be used to perform queries"
    if _g_named_manager is None:
        _init_g_named_manager()
    return _g_named_manager.get(name)


def sql(connection, name, params):
    """Build the SQL statement for the named query without running it.

    @param connection The connection whose literal() does the escaping
    @param name The name of the query
    @param params The parameters passed into the query
    @return Returns the fully substituted SQL statement.
    """
    # use module-global NamedQuery object to perform default substitution
    return get(name).sql(connection, params)


def run(connection, name, params, expect_rows = None):
    """\
@brief given a connection, run a named query with the params

Note that this function will fetch ALL rows.
@param connection The connection to use
@param name The name of the query to run
@param params The parameters passed into the query
@param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
@return Returns the result set as a list of dicts.
"""
    return get(name).run(connection, params, expect_rows)


class ExpectationFailed(Exception):
    """ Exception that is raised when an expectation for an sql query
    is not met."""
    def __init__(self, message):
        Exception.__init__(self, message)
        self.message = message


class NamedQuery(object):
    def __init__(self, name, filename):
        """ Construct a NamedQuery object.

        The name argument is an arbitrary name as a handle for the query,
        and the filename is a path to a file or a file-like object
        containing an llsd named query document.  A path that does not
        already end in NQ_FILE_SUFFIX gets the suffix appended."""
        self._stat_interval_seconds = 5  # 5 seconds
        self._name = name
        if (filename is not None
                and isinstance(filename, basestring)
                and not filename.endswith(NQ_FILE_SUFFIX)):
            filename = filename + NQ_FILE_SUFFIX
        self._location = filename
        self._alternative = dict()
        self._last_mod_time = 0
        self._last_check_time = 0
        self.deleted = False
        self.load_contents()

    def name(self):
        """ The name of the query. """
        return self._name

    def get_modtime(self):
        """ Returns the mtime (last modified time) of the named query
        filename. For file-like objects, expect a modtime of 0"""
        if self._location and isinstance(self._location, basestring):
            return os.path.getmtime(self._location)
        return 0

    def load_contents(self):
        """ Loads and parses the named query file into self.  Does nothing
        if self._location is empty or None."""
        if not self._location:
            return

        if isinstance(self._location, basestring):
            # close the file explicitly rather than relying on the
            # garbage collector to do it for us.
            query_file = open(self._location)
            try:
                raw = query_file.read()
            finally:
                query_file.close()
        else:
            # we probably have a file-like object. Godspeed!
            raw = self._location.read()
        self._reference_contents(llsd.parse(raw))

        # Check for alternative implementations.  The table is rebuilt
        # from scratch so alternatives removed from the document do not
        # linger after a reload, and only a missing 'alternative' key is
        # tolerated -- a malformed alternative raises like a malformed
        # main query does.
        alternatives = {}
        for alt_name, alt in self._contents.get('alternative', {}).items():
            nq = NamedQuery(alt_name, None)
            nq._reference_contents(alt)
            alternatives[alt_name] = nq
        self._alternative = alternatives

        self._last_mod_time = self.get_modtime()
        self._last_check_time = time.time()

    def _reference_contents(self, contents):
        "Helper method which builds internal structure from parsed contents"
        self._contents = contents
        self._ttl = int(self._contents.get('ttl', 0))
        self._return_as_map = bool(self._contents.get('return_as_map', False))
        self._legacy_dbname = self._contents.get('legacy_dbname', None)

        # reset these before doing the sql conversion because we will
        # read them there. reset these while loading so we pick up
        # changes.
        self._around = set()
        self._append = set()
        self._integer = set()

        # Each dynamic_where entry is a single clause (string), a list of
        # clauses, or a mapping of clauses; convert every clause.
        options = {}
        for key, option in self._contents.get('dynamic_where', {}).items():
            if isinstance(option, basestring):
                options[key] = self._convert_sql(option)
            elif isinstance(option, list):
                options[key] = [self._convert_sql(line) for line in option]
            else:
                converted = {}
                for sub_key in option:
                    converted[sub_key] = self._convert_sql(option[sub_key])
                options[key] = converted
        self._options = options

        self._base_query = self._convert_sql(self._contents['base_query'])
        self._query_suffix = self._convert_sql(
            self._contents.get('query_suffix', ''))

    def _convert_sql(self, sql):
        """convert the parsed sql into a useful internal structure.

        This function has to turn the named query format into a pyformat
        style. It also has to look for %:name% and :name% and
        ready them for use in LIKE statements.  Empty or None input is
        returned unchanged."""
        if not sql:
            return sql

        # This first sub is to properly escape any % signs that
        # are meant to be literally passed through to mysql in the
        # query.  It leaves any %'s that are used for
        # like-expressions.
        sql = _LITERAL_PERCENT_RE.sub('%%', sql)

        # This should tackle the rest of the %'s in the query, by
        # converting them to LIKE clauses.
        sql = _LIKE_RE.sub(self._prepare_like, sql)
        # #:name must be handled before the plain :name conversion,
        # which would otherwise consume it.
        sql = _INTEGER_RE.sub(self._prepare_integer, sql)
        sql = _PLAIN_RE.sub(r"%(\1)s", sql)
        return sql

    def _prepare_like(self, match):
        """This function changes LIKE statement replace behavior

        It works by turning %:name% to %(_name_around)s and :name% to
        %(_name_append)s. Since a leading '_' is not a valid keyname
        input (enforced via unit tests), it will never clash with
        existing keys. Then, when building the statement, the query
        runner will generate corrected strings."""
        key = match.group(2)
        if match.group(1) == '%':
            # there is a leading % so this is treated as prefix/suffix
            self._around.add(key)
            return "%(" + self._build_around_key(key) + ")s"
        # there is no leading %, so this is suffix only
        self._append.add(key)
        return "%(" + self._build_append_key(key) + ")s"

    def _build_around_key(self, key):
        return "_" + key + "_around"

    def _build_append_key(self, key):
        return "_" + key + "_append"

    def _prepare_integer(self, match):
        """This function adjusts the sql for #:name replacements

        It works by turning #:name to %(_name_as_integer)s. Since a
        leading '_' is not a valid keyname input (enforced via unit
        tests), it will never clash with existing keys. Then, when
        building the statement, the query runner will generate
        corrected strings."""
        key = match.group(1)
        self._integer.add(key)
        return "%(" + self._build_integer_key(key) + ")s"

    def _build_integer_key(self, key):
        return "_" + key + "_as_integer"

    def _strip_wildcards_to_list(self, value):
        r"""Take string, and strip out the LIKE special characters.

        The value may be a utf-8 encoded byte string or a unicode string.
        Returns a list of utf-8 encoded glyphs with every % and _ removed.

        Technically, this is database dependant, but postgresql and
        mysql use the same wildcards, and I am not aware of a general
        way to handle this. I think you need a sql statement of the
        form:

        LIKE_STRING( [ANY,ONE,str]... )

        which would treat ANY as their any string, and ONE as their
        single glyph, and str as something that needs database
        specific encoding to not allow any % or _ to affect the query.

        As it stands, I believe it's impossible to write a named query
        style interface which uses like to search the entire space of
        text available. Imagine the query:

        % of brain used by average linden

        In order to search for %, it must be escaped, so once you have
        escaped the string to not do wildcard searches, and be escaped
        for the database, and then prepended the wildcard you come
        back with one of:

        1) %\% of brain used by average linden
        2) %%% of brain used by average linden

        Then, when passed to the database to be escaped to be database
        safe, you get back:

        1) %\\% of brain used by average linden
        : which means search for any character sequence, followed by a
          backslash, followed by any sequence, followed by ' of
          brain...'
        2) %%% of brain used by average linden
        : which (I believe) means search for a % followed by any
          character sequence followed by 'of brain...'

        Neither of which is what we want!

        So, we need a vendor (or extention) for LIKE_STRING. Anyone
        want to write it?"""
        # unicode(value, "utf-8") raises TypeError for input that is
        # already unicode, so only decode byte strings.
        if isinstance(value, unicode):
            text = value
        else:
            text = unicode(value, "utf-8")
        return [glyph.encode("utf-8")
                for glyph in text
                if glyph not in _LIKE_WILDCARDS]

    def delete(self):
        """ Makes this query unusable by deleting all the members and
        setting the deleted member.  This is desired when the on-disk
        query has been deleted but the in-memory copy remains."""
        # blow away all members except _name, _location, and deleted
        name, location = self._name, self._location
        self.__dict__.clear()
        self.deleted = True
        self._name, self._location = name, location

    def ttl(self):
        """ Estimated time to live of this query. Used for web
        services to set the Expires header."""
        return self._ttl

    def legacy_dbname(self):
        """ The 'legacy_dbname' value of the query document, or None."""
        return self._legacy_dbname

    def return_as_map(self):
        """ Returns true if this query is configured to return its
        results as a single map (as opposed to a list of maps, the
        normal behavior)."""
        return self._return_as_map

    def for_schema(self, db_name):
        "Look through the alternates and return the correct query"
        return self._alternative.get(db_name, self)

    def run(self, connection, params, expect_rows = None, use_dictcursor = True):
        """given a connection, run a named query with the params

        Note that this function will fetch ALL rows. We do this because it
        opens and closes the cursor to generate the values, and this
        isn't a generator so the cursor has no life beyond the method call.

        @param connection The connection to use (this generates its own cursor for the query)
        @param params The parameters passed into the query
        @param expect_rows The number of rows expected. Set to 1 if return_as_map is true.  Raises ExpectationFailed if the number of returned rows doesn't exactly match.  Kind of a hack.
        @param use_dictcursor Set to false to use a normal cursor and manually convert the rows to dicts.
        @return Returns the result set as a list of dicts, or, if the named query has return_as_map set to true, returns a single dict.
        """
        if use_dictcursor:
            cursor = connection.cursor(MySQLdb.cursors.DictCursor)
        else:
            cursor = connection.cursor()

        # the cursor is closed on every exit path, including failures
        # while building or executing the statement.
        try:
            statement = self.sql(connection, params)
            if DEBUG:
                print("SQL: %s" % (statement,))
            rows = cursor.execute(statement)

            # *NOTE: the expect_rows argument is a very cheesy way to get some
            # validation on the result set.  If you want to add more expectation
            # logic, do something more object-oriented and flexible. Or use an ORM.
            if self._return_as_map:
                expect_rows = 1
            if expect_rows is not None and rows != expect_rows:
                raise ExpectationFailed(
                    "Statement expected %s rows, got %s. Sql: %s" % (
                        expect_rows, rows, statement))

            if use_dictcursor:
                result_set = cursor.fetchall()
            elif cursor.description is None:
                # an insert or something
                return cursor.fetchall()
            else:
                # convert to dicts manually since we're not using a dictcursor
                names = [column[0] for column in cursor.description]
                result_set = [dict(zip(names, row))
                              for row in cursor.fetchall()]
        finally:
            cursor.close()

        if self._return_as_map:
            return result_set[0]
        return result_set

    def sql(self, connection, params):
        """ Generates an SQL statement from the named query document
        and a dictionary of parameters.

        @param connection The connection whose literal() does the escaping
        @param params The parameters passed into the query.  The dictionary is not modified.
        @return Returns the fully substituted SQL statement.
        """
        self.refresh()

        # Work on a private copy: the derived keys added below must not
        # leak back into the caller's dictionary.
        params = dict(params)

        # build the query from the options available and the params
        query_parts = [self._base_query]
        for opt, extra_where in self._options.items():
            if isinstance(extra_where, (dict, list, tuple)):
                # the param value selects one of several clauses
                if opt in params:
                    query_parts.append(extra_where[params[opt]])
            elif opt in params and params[opt]:
                # a truthy param switches the single clause on
                query_parts.append(extra_where)
        if self._query_suffix:
            query_parts.append(self._query_suffix)
        full_query = '\n'.join(query_parts)

        # Go through the query and rewrite all of the ones with the
        # @:name syntax.
        rewrite = _RewriteQueryForArray(params)
        full_query = _ARRAY_RE.sub(rewrite.operate, full_query)
        params.update(rewrite.new_params)

        # build out the params for like. We only have to do this for
        # parameters which were detected to have used the where syntax
        # during load.
        #
        # * treat the incoming string as utf-8
        # * strip wildcards
        # * append or prepend % as appropriate
        new_params = {}
        for key in params:
            if key in self._around:
                glyphs = ['%']
                glyphs.extend(self._strip_wildcards_to_list(params[key]))
                glyphs.append('%')
                new_params[self._build_around_key(key)] = ''.join(glyphs)
            if key in self._append:
                glyphs = self._strip_wildcards_to_list(params[key])
                glyphs.append('%')
                new_params[self._build_append_key(key)] = ''.join(glyphs)
            if key in self._integer:
                new_params[self._build_integer_key(key)] = int(params[key])
        params.update(new_params)

        # do substitution using the mysql (non-standard) 'literal'
        # function to do the escaping.
        return full_query % connection.literal(params)

    def refresh(self):
        """ Refresh self from the file on the filesystem.

        This is optimized to be callable as frequently as you wish,
        without adding too much load.  It does so by only stat-ing the
        file every N seconds, where N defaults to 5 and is
        configurable through the member _stat_interval_seconds.  If the stat
        reveals that the file has changed, refresh will re-parse the
        contents of the file and use them to update the named query
        instance.  If the stat reveals that the file has been deleted,
        refresh will call self.delete to make the in-memory
        representation unusable, and the OSError is re-raised."""
        now = time.time()
        if now - self._last_check_time <= self._stat_interval_seconds:
            return

        self._last_check_time = now
        try:
            modtime = self.get_modtime()
            if modtime > self._last_mod_time:
                self.load_contents()
        except OSError:
            # sys.exc_info() is used instead of binding the exception in
            # the except clause because it is spelled the same on every
            # Python version.
            err = sys.exc_info()[1]
            if err.errno == errno.ENOENT:  # file not found
                self.delete()  # clean up self
            raise  # pass the exception along to the caller so they know that this query disappeared


class NamedQueryManager(object):
    """ Manages the lifespan of NamedQuery objects, drawing from a
    directory hierarchy of named query documents.

    In practice this amounts to a memory cache of NamedQuery objects."""

    def __init__(self, named_queries_dir):
        """ Initializes a manager to look for named queries in a
        directory."""
        self._dir = os.path.abspath(os.path.realpath(named_queries_dir))
        self._cached_queries = {}

    def sql(self, connection, name, params):
        """ Generates the SQL statement for the query called name."""
        nq = self.get(name)
        return nq.sql(connection, params)

    def get(self, name):
        """ Returns a NamedQuery instance based on the name, either
        from memory cache, or by parsing from disk.

        The name is simply a relative path to the directory associated
        with the manager object.  Before returning the instance, the
        NamedQuery object is cached in memory, so that subsequent
        accesses don't have to read from disk or do any parsing.  This
        means that NamedQuery objects returned by this method are
        shared across all users of the manager object.
        NamedQuery.refresh is used to bring the NamedQuery objects in
        sync with the actual files on disk."""
        nq = self._cached_queries.get(name)

        # A cached query may already have been delete()d by a direct
        # refresh()/sql() call on the shared object; it has lost its
        # members, so forget it and load from disk again.
        if nq is not None and nq.deleted:
            del self._cached_queries[name]
            nq = None

        if nq is None:
            nq = NamedQuery(name, os.path.join(self._dir, name))
            self._cached_queries[name] = nq
        else:
            try:
                nq.refresh()
            except OSError:
                err = sys.exc_info()[1]
                if err.errno == errno.ENOENT:  # file not found
                    del self._cached_queries[name]
                raise  # pass exception along to caller so they know that the query disappeared

        return nq


class _RewriteQueryForArray(object):
    "Helper class for rewriting queries with the @:name syntax"
    def __init__(self, params):
        self.params = params
        self.new_params = dict()

    def operate(self, match):
        """Given a match, return the string that should be in use

        A list/tuple value is expanded into one placeholder per element,
        and a nested list/tuple element into a parenthesised group of
        placeholders (as needed for VALUES).  The generated keys and
        their values are collected in self.new_params.  Raises
        ExpectationFailed when an element is a dict."""
        key = match.group(1)
        value = self.params[key]
        if not isinstance(value, (list, tuple)):
            # not something that can be expanded, so just drop the
            # leading @ in the front of the match. This will mean that
            # the single value we have, be it a string, int, whatever
            # (other than dict) will correctly show up, eg:
            #
            # where foo in (@:foobar) -- foobar is a string, so we get
            # where foo in (:foobar)
            return match.group(0)[1:]

        placeholders = []
        for idx, val_item in enumerate(value):
            new_key = "_%s_%s" % (key, idx)
            if isinstance(val_item, dict):
                # this is because in Python, the order of
                # key, value retrieval from the dict is not
                # guaranteed to match what the input intended
                # and for VALUES, order is important.
                # TODO: Implemented ordered dict in LLSD parser?
                raise ExpectationFailed(
                    'Only lists/tuples allowed, received dict')
            if isinstance(val_item, (list, tuple)):
                # if the value@idx is array-like, we are
                # probably dealing with a VALUES
                #
                # we want a key of the format :
                #   key_#replacement_#value_row_#value_col
                # so each row of the incoming array of arrays becomes
                # one parenthesised group of placeholders, e.g. for
                # @:user_notes:
                #   (%(_user_notes_0_0)s,%(_user_notes_0_1)s),...
                values_keys = []
                for value_idx, item in enumerate(val_item):
                    values_key = "%s_%s" % (new_key, value_idx)
                    self.new_params[values_key] = item
                    values_keys.append("%%(%s)s" % values_key)
                # collapse the placeholders of this row into one group;
                # the groups are flattened by the join below
                placeholders.append('(%s)' % ','.join(values_keys))
            else:
                self.new_params[new_key] = val_item
                placeholders.append("%%(%s)s" % new_key)
        return ','.join(placeholders)