7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208 | class FilteringManager:
"""
Manages person-to-venue matching and filtering logic.
Decouples filtering rules from the main distributor.
"""
def __init__(self, distributor):
self.distributor = distributor
self.config = distributor.config
self.verbose = distributor.verbose
def apply_global_filters(self, people: List) -> List:
"""
Apply global filters and exclusions to a list of people.
Vectorized where possible if people list is large.
"""
# vectorized path
if (hasattr(self.distributor, 'population_arrays') and
self.distributor.population_arrays and
len(people) > 1000 and
self.distributor._can_vectorize_filters(self.distributor._pre_processed_filters)):
indices = []
pid_to_idx = self.distributor.person_id_to_index
for p in people:
idx = pid_to_idx.get(p.id)
if idx is not None:
indices.append(idx)
if len(indices) == len(people):
indices_arr = np.array(indices, dtype=np.int32)
filtered_indices = self.distributor._apply_filters_vectorized(
indices_arr, self.distributor._pre_processed_filters
)
survivors = self.distributor.population_arrays['people'][filtered_indices].tolist()
# Exclusions can't be vectorized (they depend on residence.properties,
# e.g. household original_pattern), so apply them scalar-ly to the
# already-filtered survivors. Skipping this was silently ignoring
# eligibility.exclude rules for any run with >1000 eligible people.
pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})
if pre_processed_exclude:
survivors = [
p for p in survivors
if not self.person_excluded(p, pre_processed_exclude)
]
return survivors
# Scalar fallback
eligible = []
filtered_by_global = 0
filtered_by_exclusions = 0
pre_processed_filters = getattr(self.distributor, '_pre_processed_filters', [])
pre_processed_exclude = getattr(self.distributor, '_pre_processed_exclude', {})
# Pre-cache getters for performance
if pre_processed_filters and 'getter' not in pre_processed_filters[0]:
for f in pre_processed_filters:
f['getter'] = self.distributor._create_path_getter(f['path_parts'])
for person in people:
match = True
for f in pre_processed_filters:
val = f['getter'](person)
if val is None or not self._check_condition(val, f):
match = False
break
if not match:
filtered_by_global += 1
continue
if pre_processed_exclude and self.person_excluded(person, pre_processed_exclude):
filtered_by_exclusions += 1
continue
eligible.append(person)
if self.verbose:
logger.info(f"Global filters: {filtered_by_global} filtered by global rules, "
f"{filtered_by_exclusions} filtered by exclusions, {len(eligible)} eligible")
return eligible
def person_matches_filters(self, person, filters: List[Dict]) -> bool:
"""Check if person matches all filters in a group."""
if not filters:
return True
is_pre_processed = 'is_nested' in filters[0]
if is_pre_processed:
for filter_rule in filters:
person_value = self._get_person_value_optimized(person, filter_rule)
if person_value is None: return False
if not self._check_condition(person_value, filter_rule):
return False
return True
else:
# Fallback for raw filters
for filter_rule in filters:
attr_name = filter_rule.get('attribute')
person_value = self._get_person_value_raw(person, attr_name)
if person_value is None: return False
if not self._check_condition(person_value, filter_rule):
return False
return True
def _get_person_value_optimized(self, person, filter_rule: Dict) -> Any:
"""Get value using pre-processed filter rule information."""
if filter_rule.get('is_residence'):
res = person.residence
if res is None: return None
return self.distributor._get_nested_value_with_dict_support(res, filter_rule['residence_parts'])
# Check for direct attributes for speed
attr = filter_rule['attribute']
if attr == 'age': return person.age
if attr == 'sex': return person.sex
# Check person.properties first (where custom attributes like Occode are stored)
if hasattr(person, 'properties') and attr in person.properties:
val = person.properties[attr]
# NaN values should be treated as missing (None)
if isinstance(val, float) and val != val: # fast NaN check
return None
return val
return self.distributor._get_nested_value_with_dict_support(person, filter_rule['path_parts'])
def _get_person_value_raw(self, person, attr_name: str) -> Any:
"""Fallback for raw filters without pre-processing."""
return self.distributor._get_person_attribute(attr_name, person)
def _check_condition(self, person_value, filter_rule: Dict) -> bool:
filter_type = filter_rule.get('type', 'numerical')
if filter_type == 'numerical':
min_val = filter_rule.get('min')
max_val = filter_rule.get('max')
if min_val is not None and person_value < min_val: return False
if max_val is not None and person_value > max_val: return False
elif filter_type == 'categorical':
val = filter_rule.get('value')
vals = filter_rule.get('values')
if val is not None and person_value != val: return False
if vals is not None and person_value not in vals: return False
return True
def person_excluded(self, person, exclude_config: dict) -> bool:
"""Check if person should be excluded based on exclusion rules."""
household_exclusions = exclude_config.get('households', {})
if household_exclusions:
res_venue = person.residence
if res_venue is None or res_venue.type != 'household':
return False
for property_name, exclude_value in household_exclusions.items():
if hasattr(res_venue, 'properties') and isinstance(res_venue.properties, dict):
actual_value = res_venue.properties.get(property_name)
if actual_value == exclude_value:
if self.verbose:
logger.debug(f"Person {person.id} excluded: household.{property_name} == '{actual_value}'")
return True
return False
def apply_probability_filter(self, people: List, prob_config, group_name: str) -> List:
"""Apply probability filtering to a list of people."""
if not prob_config:
return people
if isinstance(prob_config, (int, float)):
probability = float(prob_config)
return [p for p in people if np.random.random() < probability]
if prob_config.get('type') == 'file':
file_path = prob_config.get('file_path')
prob_col = prob_config.get('probability_column')
lookup_attr = prob_config.get('lookup_attribute', 'geographical_unit.name')
cache_key = (file_path, prob_col)
cached_data = getattr(self.distributor, 'probability_cache', {}).get(cache_key)
if not cached_data:
logger.warning(f"Group '{group_name}': No cached probabilities for {cache_key}")
default_prob = prob_config.get('default', 0.0)
return [p for p in people if np.random.random() < default_prob]
prob_lookup = cached_data['lookup']
default_prob = cached_data['default']
selected = []
for person in people:
lookup_value = self.distributor._get_person_attribute(lookup_attr, person)
probability = prob_lookup.get(lookup_value, default_prob) if lookup_value is not None else default_prob
if np.random.random() < probability:
selected.append(person)
return selected
return people
|