Skip to content

Filtering

FilteringManager

Manages person-to-venue matching and filtering logic. Decouples filtering rules from the main distributor.

Source code in may/venue_distributor/filtering.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class FilteringManager:
    """
    Manages person-to-venue matching and filtering logic.
    Decouples filtering rules from the main distributor.

    The manager keeps a reference to the owning distributor and reads its
    pre-processed filter/exclusion rules, population arrays and probability
    cache from it.
    """

    def __init__(self, distributor):
        """Store the distributor and mirror its ``config`` and ``verbose`` attributes."""
        self.distributor = distributor
        self.config = distributor.config
        self.verbose = distributor.verbose

    def apply_global_filters(self, people: List) -> List:
        """
        Apply global filters and exclusions to a list of people.

        A vectorized path is used when the distributor exposes population
        arrays, more than 1000 people are passed, the distributor reports the
        filters as vectorizable, and every person has an entry in
        ``person_id_to_index``. Otherwise each person is checked one by one.

        Args:
            people: Person objects to filter.

        Returns:
            The people that pass every global filter and are not excluded.
        """
        # vectorized path
        if (hasattr(self.distributor, 'population_arrays') and
            self.distributor.population_arrays and
            len(people) > 1000 and
            self.distributor._can_vectorize_filters(self.distributor._pre_processed_filters)):

            indices = []
            pid_to_idx = self.distributor.person_id_to_index
            for p in people:
                idx = pid_to_idx.get(p.id)
                if idx is not None:
                    indices.append(idx)

            # Only trust the vectorized result when every person was resolved
            # to an array index; otherwise fall through to the scalar path.
            if len(indices) == len(people):
                indices_arr = np.array(indices, dtype=np.int32)
                filtered_indices = self.distributor._apply_filters_vectorized(
                    indices_arr, self.distributor._pre_processed_filters
                )
                survivors = self.distributor.population_arrays['people'][filtered_indices].tolist()

                # Exclusions can't be vectorized (they depend on residence.properties,
                # e.g. household original_pattern), so apply them scalar-ly to the
                # already-filtered survivors. Skipping this was silently ignoring
                # eligibility.exclude rules for any run with >1000 eligible people.
                pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})
                if pre_processed_exclude:
                    survivors = [
                        p for p in survivors
                        if not self.person_excluded(p, pre_processed_exclude)
                    ]
                return survivors

        # Scalar fallback
        eligible = []
        filtered_by_global = 0
        filtered_by_exclusions = 0

        pre_processed_filters = getattr(self.distributor, '_pre_processed_filters', [])
        pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})

        # Pre-cache getters for performance. Each filter is checked on its own:
        # looking only at the first filter would raise KeyError below whenever
        # a later filter had not been given a getter yet.
        for f in pre_processed_filters:
            if 'getter' not in f:
                f['getter'] = self.distributor._create_path_getter(f['path_parts'])

        for person in people:
            match = True
            for f in pre_processed_filters:
                val = f['getter'](person)
                # A missing value never satisfies a filter.
                if val is None or not self._check_condition(val, f):
                    match = False
                    break

            if not match:
                filtered_by_global += 1
                continue

            if pre_processed_exclude and self.person_excluded(person, pre_processed_exclude):
                filtered_by_exclusions += 1
                continue

            eligible.append(person)

        if self.verbose:
            logger.info(f"Global filters: {filtered_by_global} filtered by global rules, "
                        f"{filtered_by_exclusions} filtered by exclusions, {len(eligible)} eligible")

        return eligible

    def person_matches_filters(self, person, filters: List[Dict]) -> bool:
        """
        Check if person matches all filters in a group.

        Filters are treated as pre-processed when the first rule carries an
        ``'is_nested'`` key; otherwise the raw ``'attribute'`` lookup is used.
        An empty filter list matches everyone, and a person whose value for
        any rule is ``None`` never matches.
        """
        if not filters:
            return True

        is_pre_processed = 'is_nested' in filters[0]

        if is_pre_processed:
            for filter_rule in filters:
                person_value = self._get_person_value_optimized(person, filter_rule)
                if person_value is None:
                    return False

                if not self._check_condition(person_value, filter_rule):
                    return False
            return True
        else:
            # Fallback for raw filters
            for filter_rule in filters:
                attr_name = filter_rule.get('attribute')
                person_value = self._get_person_value_raw(person, attr_name)
                if person_value is None:
                    return False

                if not self._check_condition(person_value, filter_rule):
                    return False
            return True

    def _get_person_value_optimized(self, person, filter_rule: Dict) -> Any:
        """
        Get value using pre-processed filter rule information.

        Lookup order: residence path (when the rule is flagged
        ``is_residence``), the direct ``age``/``sex`` attributes, the person's
        ``properties`` mapping, and finally the distributor's nested lookup
        over ``path_parts``. Returns ``None`` when the person has no residence
        for a residence rule or when the stored property is NaN.
        """
        if filter_rule.get('is_residence'):
            res = person.residence
            if res is None:
                return None
            return self.distributor._get_nested_value_with_dict_support(res, filter_rule['residence_parts'])

        # Check for direct attributes for speed
        attr = filter_rule['attribute']
        if attr == 'age':
            return person.age
        if attr == 'sex':
            return person.sex

        # Check person.properties first (where custom attributes like Occode are stored)
        if hasattr(person, 'properties') and attr in person.properties:
            val = person.properties[attr]
            # NaN values should be treated as missing (None)
            if isinstance(val, float) and val != val:  # fast NaN check
                return None
            return val

        return self.distributor._get_nested_value_with_dict_support(person, filter_rule['path_parts'])

    def _get_person_value_raw(self, person, attr_name: str) -> Any:
        """Fallback for raw filters without pre-processing; delegates to the distributor."""
        return self.distributor._get_person_attribute(attr_name, person)

    def _check_condition(self, person_value, filter_rule: Dict) -> bool:
        """
        Evaluate a single filter rule against an already-fetched value.

        ``'numerical'`` rules (the default when ``'type'`` is absent) test the
        value against the optional inclusive ``'min'``/``'max'`` bounds.
        ``'categorical'`` rules test equality with ``'value'`` and membership
        in ``'values'`` when those keys are present. Any other type passes.
        """
        filter_type = filter_rule.get('type', 'numerical')
        if filter_type == 'numerical':
            min_val = filter_rule.get('min')
            max_val = filter_rule.get('max')
            if min_val is not None and person_value < min_val:
                return False
            if max_val is not None and person_value > max_val:
                return False
        elif filter_type == 'categorical':
            val = filter_rule.get('value')
            vals = filter_rule.get('values')
            if val is not None and person_value != val:
                return False
            if vals is not None and person_value not in vals:
                return False
        return True

    def person_excluded(self, person, exclude_config: dict) -> bool:
        """
        Check if person should be excluded based on exclusion rules.

        Only the ``'households'`` section of ``exclude_config`` is consulted.
        A person is excluded when their residence has type ``'household'`` and
        one of its ``properties`` equals the configured value for that
        property name.
        """
        household_exclusions = exclude_config.get('households', {})
        if household_exclusions:
            res_venue = person.residence
            # Household rules cannot apply to people without a household residence.
            if res_venue is None or res_venue.type != 'household':
                return False

            for property_name, exclude_value in household_exclusions.items():
                if hasattr(res_venue, 'properties') and isinstance(res_venue.properties, dict):
                    actual_value = res_venue.properties.get(property_name)
                    if actual_value == exclude_value:
                        if self.verbose:
                            logger.debug(f"Person {person.id} excluded: household.{property_name} == '{actual_value}'")
                        return True
        return False

    def apply_probability_filter(self, people: List, prob_config, group_name: str) -> List:
        """
        Apply probability filtering to a list of people.

        Args:
            people: Candidates to sample from.
            prob_config: ``None``/``False``/empty mapping (no filtering), a
                number used as a fixed selection probability for everyone, or
                a mapping with ``type == 'file'`` that selects per-person
                probabilities from the distributor's ``probability_cache``.
            group_name: Name used in the warning emitted when no cached
                probabilities are found.

        Returns:
            The randomly selected subset; the input list itself when no
            filtering applies.
        """
        # A numeric 0 is a real probability ("select nobody"), not a missing
        # configuration, so it must not be caught by the falsy check. ``False``
        # keeps its previous "no filtering" meaning.
        is_number = isinstance(prob_config, (int, float)) and not isinstance(prob_config, bool)
        if not is_number and not prob_config:
            return people

        if isinstance(prob_config, (int, float)):
            probability = float(prob_config)
            return [p for p in people if np.random.random() < probability]

        if prob_config.get('type') == 'file':
            file_path = prob_config.get('file_path')
            prob_col = prob_config.get('probability_column')
            lookup_attr = prob_config.get('lookup_attribute', 'geographical_unit.name')

            cache_key = (file_path, prob_col)
            cached_data = getattr(self.distributor, 'probability_cache', {}).get(cache_key)

            if not cached_data:
                logger.warning(f"Group '{group_name}': No cached probabilities for {cache_key}")
                default_prob = prob_config.get('default', 0.0)
                return [p for p in people if np.random.random() < default_prob]

            prob_lookup = cached_data['lookup']
            default_prob = cached_data['default']

            selected = []
            for person in people:
                lookup_value = self.distributor._get_person_attribute(lookup_attr, person)
                # People without a lookup value, or with one missing from the
                # table, get the cached default probability.
                probability = prob_lookup.get(lookup_value, default_prob) if lookup_value is not None else default_prob
                if np.random.random() < probability:
                    selected.append(person)
            return selected

        # Unrecognised configuration types leave the list untouched.
        return people

apply_global_filters(people)

Apply global filters and exclusions to a list of people. A vectorized path is used where possible when the list of people is large.

Source code in may/venue_distributor/filtering.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def apply_global_filters(self, people: List) -> List:
    """
    Apply global filters and exclusions to a list of people.

    A vectorized path is used when the distributor exposes population arrays,
    more than 1000 people are passed, the distributor reports the filters as
    vectorizable, and every person has an entry in ``person_id_to_index``.
    Otherwise each person is checked one by one.

    Args:
        people: Person objects to filter.

    Returns:
        The people that pass every global filter and are not excluded.
    """
    # vectorized path
    if (hasattr(self.distributor, 'population_arrays') and
        self.distributor.population_arrays and
        len(people) > 1000 and
        self.distributor._can_vectorize_filters(self.distributor._pre_processed_filters)):

        indices = []
        pid_to_idx = self.distributor.person_id_to_index
        for p in people:
            idx = pid_to_idx.get(p.id)
            if idx is not None:
                indices.append(idx)

        # Only trust the vectorized result when every person was resolved to
        # an array index; otherwise fall through to the scalar path.
        if len(indices) == len(people):
            indices_arr = np.array(indices, dtype=np.int32)
            filtered_indices = self.distributor._apply_filters_vectorized(
                indices_arr, self.distributor._pre_processed_filters
            )
            survivors = self.distributor.population_arrays['people'][filtered_indices].tolist()

            # Exclusions can't be vectorized (they depend on residence.properties,
            # e.g. household original_pattern), so apply them scalar-ly to the
            # already-filtered survivors. Skipping this was silently ignoring
            # eligibility.exclude rules for any run with >1000 eligible people.
            pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})
            if pre_processed_exclude:
                survivors = [
                    p for p in survivors
                    if not self.person_excluded(p, pre_processed_exclude)
                ]
            return survivors

    # Scalar fallback
    eligible = []
    filtered_by_global = 0
    filtered_by_exclusions = 0

    pre_processed_filters = getattr(self.distributor, '_pre_processed_filters', [])
    pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})

    # Pre-cache getters for performance. Each filter is checked on its own:
    # looking only at the first filter would raise KeyError below whenever a
    # later filter had not been given a getter yet.
    for f in pre_processed_filters:
        if 'getter' not in f:
            f['getter'] = self.distributor._create_path_getter(f['path_parts'])

    for person in people:
        match = True
        for f in pre_processed_filters:
            val = f['getter'](person)
            # A missing value never satisfies a filter.
            if val is None or not self._check_condition(val, f):
                match = False
                break

        if not match:
            filtered_by_global += 1
            continue

        if pre_processed_exclude and self.person_excluded(person, pre_processed_exclude):
            filtered_by_exclusions += 1
            continue

        eligible.append(person)

    if self.verbose:
        logger.info(f"Global filters: {filtered_by_global} filtered by global rules, "
                    f"{filtered_by_exclusions} filtered by exclusions, {len(eligible)} eligible")

    return eligible

apply_probability_filter(people, prob_config, group_name)

Apply probability filtering to a list of people.

Source code in may/venue_distributor/filtering.py
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def apply_probability_filter(self, people: List, prob_config, group_name: str) -> List:
    """
    Apply probability filtering to a list of people.

    Args:
        people: Candidates to sample from.
        prob_config: ``None``/``False``/empty mapping (no filtering), a number
            used as a fixed selection probability for everyone, or a mapping
            with ``type == 'file'`` that selects per-person probabilities from
            the distributor's ``probability_cache``.
        group_name: Name used in the warning emitted when no cached
            probabilities are found.

    Returns:
        The randomly selected subset; the input list itself when no filtering
        applies.
    """
    # A numeric 0 is a real probability ("select nobody"), not a missing
    # configuration, so it must not be caught by the falsy check. ``False``
    # keeps its previous "no filtering" meaning.
    is_number = isinstance(prob_config, (int, float)) and not isinstance(prob_config, bool)
    if not is_number and not prob_config:
        return people

    if isinstance(prob_config, (int, float)):
        probability = float(prob_config)
        return [p for p in people if np.random.random() < probability]

    if prob_config.get('type') == 'file':
        file_path = prob_config.get('file_path')
        prob_col = prob_config.get('probability_column')
        lookup_attr = prob_config.get('lookup_attribute', 'geographical_unit.name')

        cache_key = (file_path, prob_col)
        cached_data = getattr(self.distributor, 'probability_cache', {}).get(cache_key)

        if not cached_data:
            logger.warning(f"Group '{group_name}': No cached probabilities for {cache_key}")
            default_prob = prob_config.get('default', 0.0)
            return [p for p in people if np.random.random() < default_prob]

        prob_lookup = cached_data['lookup']
        default_prob = cached_data['default']

        selected = []
        for person in people:
            lookup_value = self.distributor._get_person_attribute(lookup_attr, person)
            # People without a lookup value, or with one missing from the
            # table, get the cached default probability.
            probability = prob_lookup.get(lookup_value, default_prob) if lookup_value is not None else default_prob
            if np.random.random() < probability:
                selected.append(person)
        return selected

    # Unrecognised configuration types leave the list untouched.
    return people

person_excluded(person, exclude_config)

Check if person should be excluded based on exclusion rules.

Source code in may/venue_distributor/filtering.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def person_excluded(self, person, exclude_config: dict) -> bool:
    """
    Tell whether the exclusion rules remove this person.

    Only the ``'households'`` section of ``exclude_config`` is consulted. The
    person is excluded when their residence has type ``'household'`` and one
    of its ``properties`` equals the configured value for that property name.
    """
    rules = exclude_config.get('households', {})
    if not rules:
        return False

    home = person.residence
    if home is None or home.type != 'household':
        return False

    # Without a dict of properties there is nothing any rule could match.
    props = getattr(home, 'properties', None)
    if not isinstance(props, dict):
        return False

    for key, banned in rules.items():
        found = props.get(key)
        if found != banned:
            continue
        if self.verbose:
            logger.debug(f"Person {person.id} excluded: household.{key} == '{found}'")
        return True

    return False

person_matches_filters(person, filters)

Check if person matches all filters in a group.

Source code in may/venue_distributor/filtering.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def person_matches_filters(self, person, filters: List[Dict]) -> bool:
    """
    Return True when the person satisfies every rule in ``filters``.

    An empty rule list matches everyone. Rules count as pre-processed when
    the first one carries an ``'is_nested'`` key; otherwise values are fetched
    through the raw ``'attribute'`` lookup. A value of ``None`` fails the rule.
    """
    if not filters:
        return True

    # Choose the value lookup once, based on the shape of the first rule.
    if 'is_nested' in filters[0]:
        def fetch(rule):
            return self._get_person_value_optimized(person, rule)
    else:
        # Fallback for raw filters
        def fetch(rule):
            return self._get_person_value_raw(person, rule.get('attribute'))

    for rule in filters:
        value = fetch(rule)
        if value is None:
            return False
        if not self._check_condition(value, rule):
            return False

    return True