| Home | Trees | Indices | Help |
|
|---|
|
|
1
2 __doc__ = """Base classes for match providers.
3
4 They are used by business objects to give
5 phrasewheels the ability to guess phrases.
6
7 Copyright (C) GNUMed developers
8 license: GPL v2 or later
9 """
10 __author__ = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>"
11
12 # std lib
13 import sys
14 import logging
15 import re as regex
16 import datetime as pydt
17
18
19 # GNUmed
20 if __name__ == "__main__":
21 sys.path.insert(0, '../../')
22 from Gnumed.pycommon import gmPG2
23
24
25 _log = logging.getLogger('gm.ui')
26
27
# Characters stripped from the fragment passed to the match provider
# before looking for matches.  The double quote is part of the character
# class itself: appending it after the "+" quantifier would make the
# pattern match only runs of ignored characters that are directly
# followed by a literal double quote.  A raw string keeps the regex
# escapes (\( \[ \]) intact without invalid-escape warnings.
default_ignored_chars = r"""[?!.'"\(){}\[\]<>~#*$%^_]+"""

# Characters used to detect word boundaries, which is in turn used to
# normalize word boundaries in the input fragment.
default_word_separators = '[- \t=+&:@]+'
36 #============================================================
38 """Base class for match providing objects.
39
40 Match sources might be:
41 - database tables
42 - flat files
43 - previous input
44 - config files
45 - in-memory list created on the fly
46 """
47 print_queries = False
48 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the cMatchProvider.__init__(self) calls -- confirm.
def __init__(self):
    """Initialize thresholds, the context store and the fragment regexes."""
    self.setThresholds()

    # context values, filled via set_context()
    self._context_vals = {}
    # characters removed from a fragment before matching
    self.__ignored_chars = regex.compile(default_ignored_chars)
    # characters collapsed into one space to normalize word boundaries
    self.__word_separators = regex.compile(default_word_separators)
56 #--------------------------------------------------------
57 # actions
58 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body (default value is an assumption) -- confirm.
def getMatches(self, aFragment=None):
    """Return matches according to aFragment and the matching thresholds.

    The fragment is lower-cased, stripped of ignored characters and has
    its word separators collapsed into single spaces.  The length of the
    normalized fragment then selects substring, word-start or
    phrase-start matching.  A fragment of '*' returns all matches.

    Returns the result of the selected matcher, or (False, []) when the
    normalized fragment is shorter than every threshold.

    Raises ValueError when aFragment is None.

    FIXME: design decision: we don't worry about data source changes
           during the lifetime of a MatchProvider
    FIXME: append _("*get all items*") on truncation
    """
    if aFragment is None:
        raise ValueError('Cannot find matches without a fragment.')

    # the user explicitly asked for everything
    if aFragment == '*':
        return self.getAllMatches()

    # matching is case insensitive
    normalized = aFragment.lower()

    strip_pattern = self.__ignored_chars
    if strip_pattern is not None:
        normalized = strip_pattern.sub('', normalized)

    separator_pattern = self.__word_separators
    if separator_pattern is not None:
        normalized = ' '.join(separator_pattern.split(normalized))

    # only significant characters count
    significant = len(normalized)

    # most demanding threshold first - the order matters
    if significant >= self.__threshold_substring:
        return self.getMatchesBySubstr(normalized)
    if significant >= self.__threshold_word:
        return self.getMatchesByWord(normalized)
    if significant >= self.__threshold_phrase:
        return self.getMatchesByPhrase(normalized)
    return (False, [])
94 #--------------------------------------------------------
97 #--------------------------------------------------------
100 #--------------------------------------------------------
103 #--------------------------------------------------------
106 #--------------------------------------------------------
109 #--------------------------------------------------------
110 # configuration
111 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; parameter names
# come from the body, the default values are an assumption -- confirm.
def setThresholds(self, aPhrase=1, aWord=3, aSubstring=5):
    """Set match location thresholds.

    - the fragment passed to getMatches() must contain at least this many
      characters before it triggers a match search at:
      1) phrase_start - start of phrase (first word)
      2) word_start - start of any word within phrase
      3) in_word - _inside_ any word within phrase

    Returns True when the thresholds were stored, False (after logging
    an error and leaving the thresholds untouched) when they are not
    ordered as aPhrase <= aWord <= aSubstring.
    """
    # sanity checks
    if aSubstring < aWord:
        _log.error(
            'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aSubstring, aWord, self.__threshold_substring, self.__threshold_word
        )
        return False
    if aWord < aPhrase:
        # report the two values this check actually compares
        _log.error(
            'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
            aWord, aPhrase, self.__threshold_word, self.__threshold_phrase
        )
        return False

    # now actually reassign thresholds
    self.__threshold_phrase = aPhrase
    self.__threshold_word = aWord
    self.__threshold_substring = aSubstring

    return True
135 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the name comes
# from the property() assignment, the default is an assumption -- confirm.
def _set_word_separators(self, word_separators=None):
    """Store the compiled word separator regex, or None to disable normalization."""
    self.__word_separators = (
        None if word_separators is None else regex.compile(word_separators)
    )
141
146
147 word_separators = property(_get_word_separators, _set_word_separators)
148 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the name comes
# from the property() assignment, the default is an assumption -- confirm.
def _set_ignored_chars(self, ignored_chars=None):
    """Store the compiled ignored-characters regex, or None to disable stripping."""
    self.__ignored_chars = (
        None if ignored_chars is None else regex.compile(ignored_chars)
    )
154
159
160 ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
161 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; keyword names come
# from the documented call, the defaults are an assumption -- confirm.
def set_context(self, context=None, val=None):
    """Set a value providing context information for matches.

    The matching code may ignore it depending on its exact
    implementation. Names and values of the context depend
    on what is being matched.

    <context> -- the *placeholder* key *inside* the context
                 definition, not the context *definition* key
    <val> -- the value stored under that placeholder

    Returns False when context is None (nothing is stored), True otherwise.
    """
    if context is not None:
        self._context_vals[context] = val
        return True
    return False
176 #--------------------------------------------------------
182 #------------------------------------------------------------
183 # usable instances
184 #------------------------------------------------------------
186 """Match provider where all possible options can be held
187 in a reasonably sized, pre-allocated list.
188 """
# NOTE(review): the "def" line is absent from this listing; the parameter name
# comes from the body, the default is an assumption -- confirm.
def __init__(self, aSeq=None):
    """Set up the provider with a fixed sequence of items.

    aSeq -- a list/tuple of dicts or None. Each dict must have the
            keys (data, list_label, weight)

    Raises TypeError when aSeq is neither None nor a list/tuple.
    """
    # isinstance() also admits list/tuple subclasses, which the exact
    # type comparison rejected
    if aSeq is not None and not isinstance(aSeq, (list, tuple)):
        _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
        raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

    self.__items = aSeq
    cMatchProvider.__init__(self)
198
199 #--------------------------------------------------------
200 # internal matching algorithms
201 #
202 # if we end up here:
203 # - aFragment will not be "None"
204 # - aFragment will be lower case
205 # - we _do_ deliver matches (whether we find any is a different story)
206 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByPhrase(self, aFragment):
    """Return the items whose label starts with aFragment.

    Comparison is case insensitive.  Returns (True, matches) with the
    matches ordered by descending weight, or (False, []) when nothing
    matched.
    """
    hits = [
        entry for entry in self.__items
        if entry['list_label'].lower().startswith(aFragment.lower())
    ]
    if not hits:
        return (False, [])

    hits.sort(key=lambda entry: entry['weight'], reverse=True)
    return (True, hits)
222
223 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByWord(self, aFragment):
    """Return the items in which aFragment starts a word of the label.

    Comparison is case insensitive and a word start is either the start
    of the label or a position directly after a space.  Every occurrence
    of the fragment is considered, not only the first one, so 'ana'
    matches 'banana ana'.

    Returns (True, matches) with the matches ordered by descending
    weight, or (False, []) when nothing matched.
    """
    fragment = aFragment.lower()
    # a fragment preceded by a space starts a word inside the phrase
    word_start = ' ' + fragment
    matches = []
    for item in self.__items:
        item_label = item['list_label'].lower()
        if item_label.startswith(fragment) or word_start in item_label:
            matches.append(item)
    if not matches:
        return (False, [])

    matches.sort(key=lambda x: x['weight'], reverse=True)
    return (True, matches)
246
247 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesBySubstr(self, aFragment):
    """Return the items whose label contains aFragment anywhere.

    Comparison is case insensitive.  Returns (True, matches) with the
    matches ordered by descending weight, or (False, []) when nothing
    matched.
    """
    hits = [
        entry for entry in self.__items
        if aFragment.lower() in entry['list_label'].lower()
    ]
    if not hits:
        return (False, [])

    hits.sort(key=lambda entry: entry['weight'], reverse=True)
    return (True, hits)
262
263 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the getMatches() call -- confirm.
def getAllMatches(self):
    """Return all items, ordered by descending weight.

    Returns (True, matches) or (False, []) when there are no items
    (including when the provider holds None instead of a sequence).
    The result is a new list, so neither sorting here nor later changes
    made by the caller reorder or alter the provider's own sequence;
    this also makes tuples of items work.
    """
    items = self.__items
    if not items:
        return (False, [])

    matches = sorted(items, key=lambda x: x['weight'], reverse=True)
    return (True, matches)
274
275 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the method name is
# not visible anywhere in it and is an assumption -- confirm.
def set_items(self, items):
    """Replace the items this provider searches.

    items -- a list of dicts, each with the keys (data, list_label, weight)
    """
    self.__items = items
279
280 # #--------------------------------------------------------
281 # def __cmp_items(self, item1, item2):
282 # """Compare items based on weight."""
283 # if item1['weight'] == item2['weight']:
284 # return 0
285 #
286 # # do it the wrong way round to do sorting/reversing at once
287 # if item1['weight'] < item2['weight']:
288 # return 1
289 # if item1['weight'] > item2['weight']:
290 # return -1
291
292 # ===========================================================
294 """Match provider which searches matches
295 in the results of a function call.
296 """
# NOTE(review): the "def" line is absent from this listing; the parameter name
# comes from the body, the default is an assumption -- confirm.
def __init__(self, get_candidates=None):
    """Set up the provider with the function supplying match candidates.

    get_candidates -- callable taking no arguments; the matchers read
                      candidate['list_label'] on each returned item

    Raises ValueError when get_candidates is None.
    """
    if get_candidates is None:
        _log.error('must define function to retrieve match candidates list')
        raise ValueError('must define function to retrieve match candidates list')

    self._get_candidates = get_candidates
    cMatchProvider.__init__(self)
305 #--------------------------------------------------------
306 # internal matching algorithms
307 #
308 # if we end up here:
309 # - aFragment will not be "None"
310 # - aFragment will be lower case
311 # - we _do_ deliver matches (whether we find any is a different story)
312 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByPhrase(self, aFragment):
    """Return the candidates whose label starts with aFragment.

    aFragment is expected to be lower case already; labels are
    lower-cased before comparison.

    Returns (True, matches) or (False, []) when nothing matched.
    """
    # the label must start with the fragment, not the other way round
    matches = [
        candidate for candidate in self._get_candidates()
        if candidate['list_label'].lower().startswith(aFragment)
    ]
    if not matches:
        return (False, [])

    matches.sort(key=self.__cmp_candidates)
    return (True, matches)
328 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByWord(self, aFragment):
    """Return the candidates in which aFragment starts a word of the label.

    aFragment is expected to be lower case already; labels are
    lower-cased before comparison.  A word start is either the start of
    the label or a position directly after a space.  Candidates which do
    not contain the fragment at all are never reported.

    Returns (True, matches) or (False, []) when nothing matched.
    """
    # FIXME: use word seps
    word_start = ' ' + aFragment
    matches = []
    for candidate in self._get_candidates():
        label = candidate['list_label'].lower()
        # test on the lower-cased label throughout, and accept any
        # occurrence at a word start rather than only the first one
        if label.startswith(aFragment) or word_start in label:
            matches.append(candidate)
    if not matches:
        return (False, [])

    matches.sort(key=self.__cmp_candidates)
    return (True, matches)
348 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesBySubstr(self, aFragment):
    """Return the candidates whose lower-cased label contains aFragment.

    Returns (True, matches) or (False, []) when nothing matched.
    """
    hits = [
        entry for entry in self._get_candidates()
        if aFragment in entry['list_label'].lower()
    ]
    if not hits:
        return (False, [])

    hits.sort(key=self.__cmp_candidates)
    return (True, hits)
364 #--------------------------------------------------------
368 #--------------------------------------------------------
369 #def __cmp_candidates(self, candidate1, candidate2):
373 # FIXME: do ordering
374 # if candidate1 < candidate2:
375 # return -1
376 # if candidate1 == candidate2:
377 # return 0
378 # return 1
379
380 # ===========================================================
382 """Match provider which searches matches
383 in possibly several database tables.
384
385 queries:
386 - a list of unicode strings
387 - each string is a query
388 - each string must contain: "... WHERE <column> %(fragment_condition)s ..."
389 - each string can contain in the where clause: "... %(<ctxt_key1>)s ..."
390 - each query must return (data, list_label, field_label)
391
392 context definitions to be used in the queries, example:
393 {'ctxt_key1': {'where_part': 'AND country = %(country)s', 'placeholder': 'country'}}
394
395 client code using .set_context() must use the 'placeholder':
396 <phrasewheel>/<match provider>.set_context('country', 'Germany')
397
398 full example query:
399
400 query = u" " "
401 SELECT DISTINCT ON (list_label)
402 pk_encounter
403 AS data,
404 to_char(started, 'YYYY Mon DD (HH24:MI)') || ': ' || l10n_type || ' [#' || pk_encounter || ']'
405 AS list_label,
406 to_char(started, 'YYYY Mon DD') || ': ' || l10n_type
407 AS field_label
408 FROM
409 clin.v_pat_encounters
410 WHERE
411 (
412 l10n_type %(fragment_condition)s
413 OR
414 type %(fragment_condition)s
415 ) %(ctxt_patient)s
416 ORDER BY
417 list_label
418 LIMIT
419 30
420 " " "
421 context = {'ctxt_patient': {
422 'where_part': u'AND pk_patient = %(PLACEHOLDER)s',
423 'placeholder': u'PLACEHOLDER'
424 }}
425 self.mp = gmMatchProvider.cMatchProvider_SQL2(queries = query, context = context)
426 self.set_context(context = 'PLACEHOLDER', val = '<THE VALUE>')
427
428 _SQL_data2match:
429 SQL to retrieve a match by, say, primary key
430 wherein the only keyword argument is 'pk'
431 """
433
# NOTE(review): the "def" line is absent from this listing; keyword names come
# from the documented call, the defaults are an assumption -- confirm.
def __init__(self, queries=None, context=None):
    """Set up the provider with its queries and context definitions.

    queries -- a single query string or a list/tuple of query strings
    context -- dict of context definitions, or None for no context
    """
    cMatchProvider.__init__(self)

    # keep a private list: queries which fail at runtime get dropped
    # from it, which must not alter a list owned by the caller
    if isinstance(queries, (list, tuple)):
        self._queries = list(queries)
    else:
        self._queries = [queries]

    if context is None:
        self._context = {}
    else:
        self._context = context

    # arguments handed to the database along with each query
    self._args = {}

    self._SQL_data2match = None
449 #--------------------------------------------------------
450 # internal matching algorithms
451 #
452 # if we end up here:
453 # - aFragment will not be "None"
454 # - aFragment will be lower case
455 # - we _do_ deliver matches (whether we find any is a different story)
456 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByPhrase(self, aFragment):
    """Return matches for aFragment at the start of phrases.

    Stores "<aFragment>%" as the 'fragment' query argument and searches
    with an ILIKE condition.
    """
    self._args['fragment'] = "{}%".format(aFragment)
    return self._find_matches("ILIKE %(fragment)s")
464 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesByWord(self, aFragment):
    """Return matches for aFragment at the start of words inside phrases.

    The fragment is passed through gmPG2.sanitize_pg_regex() and then
    searched for with the ~* operator, either after a space or at the
    start of the value.
    """
    sanitized = gmPG2.sanitize_pg_regex(expression=aFragment, escape_all=False)
    self._args['fragment'] = "( {0})|(^{0})".format(sanitized)
    return self._find_matches("~* %(fragment)s")
473 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the body and the getMatches() call -- confirm.
def getMatchesBySubstr(self, aFragment):
    """Return matches for aFragment as a true substring.

    Stores "%<aFragment>%" as the 'fragment' query argument and searches
    with an ILIKE condition.
    """
    self._args['fragment'] = "%{}%".format(aFragment)
    return self._find_matches("ILIKE %(fragment)s")
481 #--------------------------------------------------------
485 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the method name is
# not visible anywhere in it and, like the default, is an assumption -- confirm.
def get_match_by_data(self, data=None):
    """Return the single row found by running _SQL_data2match for data.

    data -- passed to the query as the 'pk' argument

    Returns None when no _SQL_data2match is configured, when running it
    fails (the query is then dropped for good), or when it does not
    return exactly one row.
    """
    if self._SQL_data2match is None:
        return None

    query = {'cmd': self._SQL_data2match, 'args': {'pk': data}}
    try:
        rows, _idx = gmPG2.run_ro_queries(queries=[query], get_col_idx=False)
    except Exception:
        # do not swallow KeyboardInterrupt/SystemExit along with query errors
        _log.exception('[%s]: error running _SQL_data2match, dropping query', self.__class__.__name__)
        self._SQL_data2match = None
        return None

    # hopefully the most frequent case:
    if len(rows) == 1:
        return rows[0]

    _log.error('[%s]: 0 or >1 rows found by running _SQL_data2match, ambiguous, ignoring', self.__class__.__name__)
    return None
504 #--------------------------------------------------------
# NOTE(review): the "def" line is absent from this listing; the signature is
# reconstructed from the self._find_matches(fragment_condition) calls -- confirm.
def _find_matches(self, fragment_condition):
    """Run the queries in order and return the rows of the first one yielding any.

    fragment_condition -- SQL snippet substituted for
                          %(fragment_condition)s in each query

    For each context definition whose placeholder has a value in
    self._context_vals the 'where_part' is spliced into the query and
    the value added to the query arguments; otherwise an empty string
    is spliced in.

    A query which fails to run is logged and removed from
    self._queries, and the search is given up for this call.

    Returns (True, matches), each match being a dict with the keys
    (weight, data, list_label, field_label), or (False, []).
    """
    if self.print_queries:
        print("----------------------")
        print(pydt.datetime.now())
    matches = []
    for query in self._queries:
        where_fragments = {'fragment_condition': fragment_condition}

        for context_key, context_def in self._context.items():
            try:
                placeholder = context_def['placeholder']
                where_part = context_def['where_part']
                self._args[placeholder] = self._context_vals[placeholder]
                # we do have a context value for this key, so add the where condition
                where_fragments[context_key] = where_part
                if self.print_queries:
                    print("ctxt ph:", placeholder)
                    print("ctxt where:", where_part)
                    print("ctxt val:", self._context_vals[placeholder])
            except KeyError:
                # we don't have a context value for this key, so skip the where condition
                where_fragments[context_key] = ''
                if self.print_queries:
                    print("invalid ctxt key:", context_key)

        cmd = query % where_fragments

        if self.print_queries:
            print("class:", self.__class__.__name__)
            print("ctxt:", self._context_vals)
            print("args:", self._args)
            print("query:", cmd)

        try:
            rows, _idx = gmPG2.run_ro_queries(queries=[{'cmd': cmd, 'args': self._args}], get_col_idx=False)
        except Exception:
            # do not swallow KeyboardInterrupt/SystemExit along with query errors
            _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
            # leaving the loop right away makes it safe to shrink the
            # list that is being iterated
            self._queries.remove(query)
            break

        # no matches found: try next query
        if len(rows) == 0:
            continue

        for row in rows:
            match = {'weight': 0}

            # prefer named columns, fall back to positions
            try:
                match['data'] = row['data']
            except KeyError:
                match['data'] = row[0]

            try:
                match['list_label'] = row['list_label']
            except KeyError:
                match['list_label'] = row[1]

            # explicit "field_label" in result ?
            try:
                match['field_label'] = row['field_label']
            except KeyError:
                # no, but does row[2] exist ?
                try:
                    match['field_label'] = row[2]
                except IndexError:
                    # no: reuse "list_label"
                    match['field_label'] = match['list_label']

            matches.append(match)

        return (True, matches)

    # none found whatsoever
    return (False, [])
586
587 #================================================================
588 if __name__ == '__main__':
589 pass
590
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Fri Jan 25 02:55:27 2019 | http://epydoc.sourceforge.net |