PageRenderTime 29ms CodeModel.GetById 34ms RepoModel.GetById 1ms app.codeStats 0ms

/corehq/motech/openmrs/finders.py

https://github.com/dimagi/commcare-hq
Python | 232 lines | 173 code | 11 blank | 48 comment | 7 complexity | a70a2a28a951d32625a988180f018162 MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1
  1. """
  2. PatientFinders are used to find OpenMRS patients that correspond to
  3. CommCare cases if none of the patient identifiers listed in
  4. OpenmrsCaseConfig.match_on_ids have successfully matched a patient.
  5. See `README.md`__ for more context.
  6. """
  7. import logging
  8. from collections import namedtuple
  9. from functools import partial
  10. from pprint import pformat
  11. from dimagi.ext.couchdbkit import (
  12. DecimalProperty,
  13. DictProperty,
  14. DocumentSchema,
  15. ListProperty,
  16. )
  17. from corehq.motech.finders import (
  18. MATCH_FUNCTIONS,
  19. PropertyWeight,
  20. )
  21. from corehq.motech.openmrs.const import OPENMRS_DATA_TYPE_BOOLEAN
  22. from corehq.motech.value_source import (
  23. deserialize,
  24. recurse_subclasses,
  25. )
  26. logger = logging.getLogger(__name__)
  27. constant_false = {
  28. "value": 'False',
  29. # We are fetching from a case property or a form question value, and
  30. # we want `get_value()` to return False (bool). `get_value()`
  31. # serialises case properties and form question values as external
  32. # data types. OPENMRS_DATA_TYPE_BOOLEAN is useful because it is a
  33. # bool, not a string, so `constant_false.get_value()` will return
  34. # False (not 'False')
  35. "external_data_type": OPENMRS_DATA_TYPE_BOOLEAN,
  36. }
  37. class PatientFinder(DocumentSchema):
  38. """
  39. The ``PatientFinder`` base class was developed as a way to
  40. handle situations where patient cases are created in CommCare
  41. instead of being imported from OpenMRS.
  42. When patients are imported from OpenMRS, they will come with at
  43. least one identifier that MOTECH can use to match the case in
  44. CommCare with the corresponding patient in OpenMRS. But if the case
  45. is registered in CommCare then we may not have an ID, or the ID
  46. could be wrong. We need to search for a corresponding OpenMRS
  47. patient.
  48. Different projects may focus on different kinds of case properties,
  49. so it was felt that a base class would allow some flexibility.
  50. The ``PatientFinder.wrap()`` method allows you to wrap documents of
  51. subclasses.
  52. The ``PatientFinder.find_patients()`` method must be implemented by
  53. subclasses. It returns a list of zero, one, or many patients. If it
  54. returns one patient, the OpenmrsRepeater.find_or_create_patient()
  55. will accept that patient as a true match.
  56. .. NOTE:: The consequences of a false positive (a Type II error) are
  57. severe: A real patient will have their valid values
  58. overwritten by those of someone else. So ``PatientFinder``
  59. subclasses should be written and configured to skew
  60. towards false negatives (Type I errors). In other words,
  61. it is much better not to choose a patient than to choose
  62. the wrong patient.
  63. """
  64. # Whether to create a new patient if no patients are found
  65. create_missing = DictProperty(default=constant_false)
  66. @classmethod
  67. def wrap(cls, data):
  68. if 'create_missing' in data and isinstance(data['create_missing'], bool):
  69. data['create_missing'] = {
  70. 'external_data_type': OPENMRS_DATA_TYPE_BOOLEAN,
  71. 'value': str(data['create_missing'])
  72. }
  73. if cls is PatientFinder:
  74. subclass = {
  75. sub._doc_type: sub for sub in recurse_subclasses(cls)
  76. }.get(data['doc_type'])
  77. return subclass.wrap(data) if subclass else None
  78. else:
  79. return super(PatientFinder, cls).wrap(data)
  80. def find_patients(self, requests, case, case_config):
  81. """
  82. Given a case, search OpenMRS for possible matches. Return the
  83. best results. Subclasses must define "best". If just one result
  84. is returned, it will be chosen.
  85. """
  86. raise NotImplementedError
  87. PatientScore = namedtuple('PatientScore', ['patient', 'score'])
  88. class WeightedPropertyPatientFinder(PatientFinder):
  89. """
  90. The ``WeightedPropertyPatientFinder`` class finds OpenMRS patients
  91. that match CommCare cases by assigning weights to case properties,
  92. and adding the weights of matching patient properties to calculate a
  93. confidence score.
  94. """
  95. # Identifiers that are searchable in OpenMRS. e.g.
  96. # [ 'bahmni_id', 'household_id', 'last_name']
  97. searchable_properties = ListProperty()
  98. # The weight assigned to a matching property.
  99. # [
  100. # {"case_property": "bahmni_id", "weight": 0.9},
  101. # {"case_property": "household_id", "weight": 0.9},
  102. # {
  103. # "case_property": "dob",
  104. # "weight": 0.75,
  105. # "match_type": "days_diff",
  106. # // days_diff matches based on days difference from given date
  107. # "match_params": [364]
  108. # },
  109. # {
  110. # "case_property": "first_name",
  111. # "weight": 0.025,
  112. # "match_type": "levenshtein",
  113. # // levenshtein function takes edit_distance / len
  114. # "match_params": [0.2]
  115. # // i.e. 20% is one edit for every 5 characters
  116. # // e.g. "Riyaz" matches "Riaz" but not "Riazz"
  117. # },
  118. # {"case_property": "last_name", "weight": 0.025},
  119. # {"case_property": "municipality", "weight": 0.2},
  120. # ]
  121. property_weights = ListProperty(PropertyWeight)
  122. # The threshold that the sum of weights must pass for a CommCare case to
  123. # be considered a match to an OpenMRS patient
  124. threshold = DecimalProperty(default=1.0)
  125. # If more than one patient passes `threshold`, the margin by which the
  126. # weight of the best match must exceed the weight of the second-best match
  127. # to be considered correct.
  128. confidence_margin = DecimalProperty(default=0.667) # Default: Matches two thirds better than second-best
  129. def __init__(self, *args, **kwargs):
  130. super(WeightedPropertyPatientFinder, self).__init__(*args, **kwargs)
  131. self._property_map = {}
  132. def get_score(self, patient, case):
  133. """
  134. Return the sum of weighted properties to give an OpenMRS
  135. patient a score of how well they match a CommCare case.
  136. """
  137. def weights():
  138. for property_weight in self.property_weights:
  139. prop = property_weight['case_property']
  140. jsonpath, value_source_dict = self._property_map[prop]
  141. weight = property_weight['weight']
  142. matches = jsonpath.find(patient)
  143. for match in matches:
  144. patient_value = match.value
  145. case_value = case.get_case_property(prop)
  146. match_type = property_weight['match_type']
  147. match_params = property_weight['match_params']
  148. match_function = partial(MATCH_FUNCTIONS[match_type], *match_params)
  149. is_equivalent = match_function(deserialize(value_source_dict, patient_value), case_value)
  150. yield weight if is_equivalent else 0
  151. return sum(weights())
  152. def find_patients(self, requests, case, case_config):
  153. """
  154. Matches cases to patients. Returns a list of patients, each
  155. with a confidence score >= self.threshold
  156. """
  157. from corehq.motech.openmrs.openmrs_config import get_property_map
  158. from corehq.motech.openmrs.repeater_helpers import search_patients
  159. self._property_map = get_property_map(case_config)
  160. candidates = {} # key on OpenMRS UUID to filter duplicates
  161. for prop in self.searchable_properties:
  162. value = case.get_case_property(prop)
  163. if value:
  164. response_json = search_patients(requests, value)
  165. for patient in response_json['results']:
  166. score = self.get_score(patient, case)
  167. if score >= self.threshold:
  168. candidates[patient['uuid']] = PatientScore(patient, score)
  169. if not candidates:
  170. logger.info(
  171. 'Unable to match case "%s" (%s): No candidate patients found.',
  172. case.name, case.get_id,
  173. )
  174. return []
  175. if len(candidates) == 1:
  176. patient = list(candidates.values())[0].patient
  177. logger.info(
  178. 'Matched case "%s" (%s) to ONLY patient candidate: \n%s',
  179. case.name, case.get_id, pformat(patient, indent=2),
  180. )
  181. return [patient]
  182. patients_scores = sorted(candidates.values(), key=lambda candidate: candidate.score, reverse=True)
  183. if patients_scores[0].score / patients_scores[1].score > 1 + self.confidence_margin:
  184. # There is more than a `confidence_margin` difference
  185. # (defaults to 66.7%) in score between the best-ranked
  186. # patient and the second-best-ranked patient. Let's go with
  187. # Patient One.
  188. patient = patients_scores[0].patient
  189. logger.info(
  190. 'Matched case "%s" (%s) to BEST patient candidate: \n%s',
  191. case.name, case.get_id, pformat(patients_scores, indent=2),
  192. )
  193. return [patient]
  194. # We can't be sure. Just send them all.
  195. logger.info(
  196. 'Unable to match case "%s" (%s) to patient candidates: \n%s',
  197. case.name, case.get_id, pformat(patients_scores, indent=2),
  198. )
  199. return [ps.patient for ps in patients_scores]