Skip to content

Population

PopulationManager

Manages population generation and distribution.

This class loads demographic data and creates Person objects distributed across geographical units according to specified distributions.

Source code in may/population/population.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
class PopulationManager:
    """
    Manages population generation and distribution.

    This class loads demographic data and creates Person objects distributed
    across geographical units according to specified distributions.
    """

    def __init__(self, geography, data_dir):
        """
        Bind the manager to a geography and a data directory, starting empty.

        Args:
            geography (Geography): Geography object containing geographical units
            data_dir (str): Directory containing population data files
        """
        self.geography, self.data_dir = geography, data_dir

        # Registry of people: insertion-ordered list plus an id -> Person index
        self.people = list()
        self.people_by_id = dict()

        # Precise demographics: geo_unit -> age -> sex -> count
        self.precise_demographics = dict()

    def __len__(self):
        """Return the number of people currently registered."""
        registered = self.people
        return len(registered)

    @staticmethod
    def _create_nested_defaultdict():
        """
        Create a nested defaultdict for demographics storage.

        This is a separate function (not a lambda) to make the object pickle-compatible.
        Returns a defaultdict(dict) for storing age -> sex -> count mappings.
        """
        return defaultdict(dict)

    def load_demographics_from_csv(self, male_file="demographics_male.csv",
                                     female_file="demographics_female.csv"):
        """
        Read per-sex age-count matrices from CSV into ``self.precise_demographics``.

        Expected format (one file per sex):
            geo_unit,0,1,2,3,...,100
            E00004320,2,2,1,3,...,0
            E00004321,1,3,2,2,...,1
            ...

        Rows are geo units; columns are ages. Only rows whose ``geo_unit`` is a
        unit of the first level in ``self.geography.levels`` are kept, and an
        optional ``index`` column is ignored. The result is stored as
        geo_unit -> age -> sex -> count with zero counts omitted.

        The method logs and returns without loading anything when either file
        is missing or when the geography has no units at that level.

        Args:
            male_file (str): Filename for male demographics (inside data_dir)
            female_file (str): Filename for female demographics (inside data_dir)

        Raises:
            ValueError: If either file has no 'geo_unit' column.
        """
        male_path = os.path.join(self.data_dir, male_file)
        female_path = os.path.join(self.data_dir, female_file)

        if not (os.path.exists(male_path) and os.path.exists(female_path)):
            logger.error(f"Demographics files not found: {male_path} or {female_path}")
            logger.info("Cannot generate population without demographics data")
            return

        # Demographics are only meaningful for units that exist at the
        # smallest (first) level of the loaded geography.
        smallest_level = self.geography.levels[0]
        smallest_units_dict = self.geography.get_units_by_level(smallest_level)
        if not smallest_units_dict:
            logger.warning(f"No {smallest_level} units found in geography. Cannot load demographics.")
            return

        # Set of known unit names for fast membership tests
        valid_geo_units = set(smallest_units_dict.keys())
        logger.info(f"Filtering demographics to {len(valid_geo_units)} {smallest_level}s in loaded geography")

        logger.info(f"Loading male demographics from {male_path}")
        male_df = pd.read_csv(male_path)

        logger.info(f"Loading female demographics from {female_path}")
        female_df = pd.read_csv(female_path)

        # Validate structure
        if not all('geo_unit' in frame.columns for frame in (male_df, female_df)):
            raise ValueError("Demographics files must have 'geo_unit' column")

        # Ignore an index column if one was written alongside the data
        male_df, female_df = (
            frame.drop(columns=['index']) if 'index' in frame.columns else frame
            for frame in (male_df, female_df)
        )

        # Restrict to geo units of our geography BEFORE any reshaping
        male_df = male_df.loc[male_df['geo_unit'].isin(valid_geo_units)]
        female_df = female_df.loc[female_df['geo_unit'].isin(valid_geo_units)]

        logger.info(f"Filtered to {len(male_df)} male geo units and {len(female_df)} female geo units")

        def to_long(wide):
            # Wide matrix -> (geo_unit, age, count) rows, zero counts dropped
            long = wide.melt(id_vars=['geo_unit'], var_name='age', value_name='count')
            long['age'] = long['age'].astype(int)
            long['count'] = long['count'].fillna(0).astype(int)
            return long[long['count'] > 0]

        def record(long, sex):
            # Write every row into the nested store; return the head-count added
            added = 0
            for unit_name, age, count in long.values:
                head_count = int(count)
                self.precise_demographics[str(unit_name)][int(age)][sex] = head_count
                added += head_count
            return added

        # Nested store: geo_unit -> age -> sex -> count.
        # The factory is a named function (not a lambda) for pickle compatibility.
        self.precise_demographics = defaultdict(self._create_nested_defaultdict)
        total_people = 0

        logger.info("Processing male demographics...")
        male_melted = to_long(male_df)

        logger.info("Processing female demographics...")
        female_melted = to_long(female_df)

        logger.info("Building demographic dictionary...")
        total_people += record(male_melted, 'male')
        total_people += record(female_melted, 'female')

        logger.info(f"Loaded precise demographics for {len(self.precise_demographics)} geographical units")
        logger.info(f"Total people in demographics: {total_people:,}")

    def load_explicit_from_csv(self, filename: str, column_mapping: Dict[str, str]):
        """
        Read an individual-level population CSV from ``data_dir`` and load it.

        Logs an error and returns without loading when the file does not exist.
        Otherwise the Person ID counter is reset and the rows are handed to
        ``load_explicit_from_df``.

        Args:
            filename (str): Name of the CSV file inside ``self.data_dir``
            column_mapping (Dict[str, str]): Passed through unchanged to
                ``load_explicit_from_df``
        """
        csv_path = os.path.join(self.data_dir, filename)

        if os.path.exists(csv_path):
            logger.info(f"Loading explicit population from {csv_path}")
            frame = pd.read_csv(csv_path)

            # Reset ID counter for consistency (at the entry point)
            Person.reset_counter()

            self.load_explicit_from_df(frame, column_mapping)
        else:
            logger.error(f"Explicit population file not found: {csv_path}")

    def load_explicit_from_df(self, df: pd.DataFrame, column_mapping: Dict[str, str]):
        """
        Create one Person per row of a DataFrame and register it.

        The geographical column is chosen in this order: the column named by
        ``column_mapping['geo_unit']`` if it is present in ``df``; otherwise the
        first column of ``df`` whose name is 'geo_unit', 'SGU', 'MGU' or one of
        ``self.geography.levels``. Rows whose unit cannot be resolved through
        ``self.geography.get_unit`` are skipped with a warning.

        Args:
            df (pd.DataFrame): One row per person.
            column_mapping (Dict[str, str]): Maps a target attribute name to the
                column that holds it. The targets 'age', 'sex' and 'geo_unit'
                are interpreted specially; any other target is stored in the
                person's properties under the target name. Columns that are
                neither mapped nor the geographical column are stored in the
                properties under their own column name.

        Raises:
            ValueError: If no geographical column can be identified.
        """
        target_to_csv = column_mapping

        # Identify geographical column
        # Priority: 1. mapped 'geo_unit', 2. first literal geo column found in df
        geo_cols = {'geo_unit', 'SGU', 'MGU'}.union(self.geography.levels)

        mapped_geo_col = target_to_csv.get('geo_unit')
        if mapped_geo_col in df.columns:
            actual_geo_col = mapped_geo_col
        else:
            actual_geo_col = next((col for col in df.columns if col in geo_cols), None)

        if actual_geo_col is None:
            raise ValueError("Missing required geographical column (e.g., 'geo_unit', 'SGU', 'MGU') in population data")

        # Columns that must not be copied verbatim into `properties`: mapped
        # columns are handled explicitly, and the geographical column drives
        # `geographical_unit` (parallel with VenueManager, which excludes its
        # geo column from the property dict). Invariant across rows, so it is
        # built once.
        reserved_cols = set(target_to_csv.values()) | {actual_geo_col}

        # Rows are keyed by the REAL column names. A namedtuple's `_asdict()`
        # would rename any column that is not a valid identifier or starts
        # with an underscore (e.g. "age group" -> "_1"), so lookups by the
        # original name would silently miss.
        columns = list(df.columns)
        people_count = 0

        for values in df.itertuples(index=False, name=None):
            row_dict = dict(zip(columns, values))
            properties = {}
            age = 0
            sex = "unknown"

            # 1. Determine geographical unit
            geo_unit_name = row_dict.get(actual_geo_col)
            geo_unit = self.geography.get_unit(geo_unit_name) if geo_unit_name else None

            if not geo_unit:
                logger.warning(f"No geographical unit found for person in row (col: {actual_geo_col}, val: {geo_unit_name}). Skipping.")
                continue

            # 2. Extract mapped attributes
            for target, csv_col in target_to_csv.items():
                if csv_col not in row_dict:
                    continue

                val = row_dict[csv_col]
                if target == 'age':
                    try:
                        age = int(float(val))
                    except (ValueError, TypeError, OverflowError):
                        # Missing (NaN), non-numeric or infinite ages fall back to 0
                        age = 0
                elif target == 'sex':
                    sex = str(val).lower().strip() if pd.notna(val) else "unknown"
                    # Normalize common sex strings
                    if sex in ('m', '1', 'male'):
                        sex = 'male'
                    elif sex in ('f', '2', 'female'):
                        sex = 'female'
                elif target == 'geo_unit':
                    # A resolvable mapped unit overrides the one found above
                    found_unit = self.geography.get_unit(str(val).strip())
                    if found_unit:
                        geo_unit = found_unit
                else:
                    # Treat as a generic property
                    properties[target] = val

            # 3. Carry over every remaining column as a property
            for col, val in row_dict.items():
                if col not in reserved_cols:
                    properties[col] = val

            # 4. Create and register the person. `geo_unit` is always resolved
            # here because unresolved rows were skipped above.
            person = Person(age=age, sex=sex, geographical_unit=geo_unit, properties=properties)
            self.add_person(person)
            geo_unit.add_person(person)

            people_count += 1

        logger.info(f"Successfully loaded {people_count:,} people from explicit data.")

    def generate_population(self, **kwargs):
        """
        Materialise Person objects from the loaded precise demographics.

        For every unit of the first level in ``self.geography.levels`` that has
        demographic data, exactly ``count`` people are created per (age, sex)
        cell. Cells are processed in ascending (age, sex) order across all
        units, so the first person created is among the youngest.

        The Person ID counter is reset before generation. The method logs and
        returns without creating anyone when no demographics are loaded or the
        geography has no units at that level.

        Args:
          **kwargs:
            Arbitrary keyword arguments forwarded unchanged to every Person.
            Supported keys include:
              * activities (list[str], optional): list of activity names for each Person.
              * properties (dict, optional): a dict of properties of the Person, e.g. 'ethnicity', 'compliance', 'taste'.
              * activity_map (DefaultDict[str,list[Subset]], optional):
                a dict mapping an activity (same string as in activities) to a list of potential Subsets the Person would
                join to fulfil that activity.

        """
        if not self.precise_demographics:
            logger.error("No demographics data loaded. Cannot generate population.")
            return

        logger.info("Generating population from precise demographics...")
        Person.reset_counter()

        # Smallest geographical level = first entry of the hierarchy
        smallest_level = self.geography.levels[0]
        units_by_name = self.geography.get_units_by_level(smallest_level)
        if not units_by_name:
            logger.warning(f"No {smallest_level} units found in geography. Cannot generate population.")
            return

        # Flatten the nested store into (age, sex, unit, count) cells
        cells = []
        for unit in units_by_name.values():
            if unit.name not in self.precise_demographics:
                logger.debug(f"No demographic data for {unit.name}, skipping")
                continue

            for age, sex_counts in self.precise_demographics[unit.name].items():
                cells.extend((age, sex, unit, count) for sex, count in sex_counts.items())

        # Order by age, then sex; the sort is stable, so unit order is kept
        # within one (age, sex) pair.
        cells.sort(key=lambda cell: cell[:2])

        geo_units_with_data = len({cell[2] for cell in cells})

        # Create the people cell by cell, i.e. in global age order
        total_people = 0
        for age, sex, unit, count in cells:
            for _ in range(count):
                newcomer = Person(age=age, sex=sex, geographical_unit=unit, **kwargs)
                self.add_person(newcomer)
                # Also register the person with their geographical unit
                unit.add_person(newcomer)
                total_people += 1

        logger.info(f"Generated {total_people:,} people across {geo_units_with_data} {smallest_level}s")
        if geo_units_with_data > 0:
            logger.info(f"Average: {total_people / geo_units_with_data:.1f} people per {smallest_level}")

    def add_person(self, person: Person):
        """
        Register a single person in both the list and the ID index.

        Args:
            person (Person): Person to register; indexed under ``person.id``
        """
        roster, index = self.people, self.people_by_id
        roster.append(person)
        index[person.id] = person

    def add_people(self, people: list[Person]):
        """
        Register several people, one ``add_person`` call each, in order.

        Args:
            people (list[Person]): People to register
        """
        register = self.add_person
        for member in people:
            register(member)

    def get_person(self, person_id):
        """
        Look up a registered person in the ID index.

        Args:
            person_id (int): ID of the person

        Returns:
            Person: The matching person, or None when the ID is not registered
        """
        index = self.people_by_id
        return index.get(person_id, None)

    def get_all_people(self):
        """
        Return every registered person.

        Returns:
            list: The manager's own ``people`` list (not a copy)
        """
        everyone = self.people
        return everyone

    def get_people_by_age_range(self, min_age, max_age):
        """
        Select the people whose age lies in a closed interval.

        Args:
            min_age (int): Lower bound (inclusive)
            max_age (int): Upper bound (inclusive)

        Returns:
            list: Matching Person objects, in registration order
        """
        matches = []
        for person in self.people:
            if min_age <= person.age <= max_age:
                matches.append(person)
        return matches

    def get_people_by_sex(self, sex):
        """
        Select the people whose ``sex`` equals the given category.

        Args:
            sex (str): Sex category to match exactly

        Returns:
            list: Matching Person objects, in registration order
        """
        matches = []
        for person in self.people:
            if person.sex == sex:
                matches.append(person)
        return matches

    def get_people_by_activity(self, activity):
        """
        Select the people for whom ``has_activity(activity)`` is truthy.

        Args:
            activity (str): Activity name

        Returns:
            list: Matching Person objects, in registration order
        """
        matches = []
        for person in self.people:
            if person.has_activity(activity):
                matches.append(person)
        return matches

    def get_people_by_geo_unit(self, geo_unit_code):
        """
        Return the people attached to one geographical unit.

        Args:
            geo_unit_code (str): Name/code of the geographical unit

        Returns:
            list: The unit's ``people`` attribute, or an empty list when the
            unit is unknown or has no ``people`` attribute
        """
        unit = self.geography.get_unit(geo_unit_code)
        if unit is not None:
            return getattr(unit, 'people', [])
        return []

    def get_statistics(self):
        """
        Summarise the registered population.

        Returns:
            dict: Empty when nobody is registered; otherwise the keys
            'total_population', 'mean_age', 'median_age', 'min_age', 'max_age',
            'sex_distribution' (sex -> head-count) and 'activity_counts'
            (activity -> number of people returned by
            ``get_people_by_activity``).
        """
        population = self.people
        if not population:
            return {}

        ages = [person.age for person in population]

        # Head-count per sex category
        sex_counts = {}
        for person in population:
            category = person.sex
            sex_counts[category] = sex_counts.get(category, 0) + 1

        # Every activity mentioned by anyone
        all_activities = set()
        for person in population:
            all_activities.update(person.activities)

        activity_counts = {
            activity: len(self.get_people_by_activity(activity))
            for activity in all_activities
        }

        return dict(
            total_population=len(population),
            mean_age=np.mean(ages),
            median_age=np.median(ages),
            min_age=np.min(ages),
            max_age=np.max(ages),
            sex_distribution=sex_counts,
            activity_counts=activity_counts,
        )
    def load_batch_explicit_from_csv(self, data_dir: str, column_mapping: Dict[str, str]):
        """
        Load individual-level population data from multiple MGU-level CSV files.

        For every unit of level "MGU" in the geography, the file
        ``<data_dir>/<mgu_name>_pop.csv`` is read if it exists. Rows are
        pre-filtered to geographical units present in the geography and then
        handed to ``load_explicit_from_df``. The Person ID counter is reset
        once, before the first file.

        Files are visited in sorted MGU-name order so that the IDs assigned to
        people are reproducible from run to run.

        Args:
            data_dir (str): Directory holding the ``<mgu_name>_pop.csv`` files
            column_mapping (Dict[str, str]): Passed through to
                ``load_explicit_from_df``; its 'geo_unit' entry, when present
                in a file, also selects the column used for pre-filtering.
        """
        # Sorted rather than set order: string hashing is randomized per
        # process, so iterating a set would shuffle the file order (and with
        # it the Person IDs) between runs.
        mgu_names = sorted(self.geography.get_units_by_level("MGU"), key=str)

        # Invariant across files, so computed once up front
        geo_cols = {'SGU', 'MGU', 'geo_unit'}.union(self.geography.levels)
        loaded_units = set(self.geography.get_all_units().keys())
        mapped_geo_col = column_mapping.get('geo_unit')

        # Reset ID counter once for the whole batch
        Person.reset_counter()

        logger.info(f"Starting batch explicit population load for {len(mgu_names)} MGUs")

        total_files = 0
        for mgu_name in mgu_names:
            path = os.path.join(data_dir, f"{mgu_name}_pop.csv")
            if not os.path.exists(path):
                logger.debug(f"No population file for MGU {mgu_name} at {path}, skipping")
                continue

            df = pd.read_csv(path)
            total_files += 1

            # Pick the geographical column the same way load_explicit_from_df
            # does: the mapped 'geo_unit' column first, then any literal geo
            # level column (SGU, MGU, or custom levels).
            if mapped_geo_col in df.columns:
                filter_col = mapped_geo_col
            else:
                filter_col = next((col for col in df.columns if col in geo_cols), None)

            if filter_col is not None:
                # Keep only rows whose unit is currently loaded in the geography
                df = df[df[filter_col].isin(loaded_units)]

            self.load_explicit_from_df(df, column_mapping)

        logger.info(f"Batch load complete. Processed {total_files} files.")

__init__(geography, data_dir)

Initialize the PopulationManager.

Parameters:

Name Type Description Default
geography Geography

Geography object containing geographical units

required
data_dir str

Directory containing population data files

required
Source code in may/population/population.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def __init__(self, geography, data_dir):
    """
    Bind the manager to a geography and a data directory, starting empty.

    Args:
        geography (Geography): Geography object containing geographical units
        data_dir (str): Directory containing population data files
    """
    self.geography, self.data_dir = geography, data_dir

    # Registry of people: insertion-ordered list plus an id -> Person index
    self.people = list()
    self.people_by_id = dict()

    # Precise demographics: geo_unit -> age -> sex -> count
    self.precise_demographics = dict()

generate_population(**kwargs)

Generate population from precise demographics data.

Creates exact number of people per age/sex/geo_unit as specified in the demographics file. People are created in age order globally (person 0 is the youngest person across all smallest geographical units).

Assumes demographics are provided for the smallest geographical level (first level in the hierarchy, e.g., SGU, village, census block, etc.).

Parameters:

Name Type Description Default
**kwargs

Arbitrary keyword arguments to be passed to the creation of Person. Supported keys include:

* activities (list[str], optional): list of activity names for each Person.
* properties (dict, optional): a dict of properties of the Person, e.g. 'ethnicity', 'compliance', 'taste'.
* activity_map (DefaultDict[str, list[Subset]], optional): a dict mapping an activity (same string as in activities) to a list of potential Subsets the Person would join to fulfil that activity.

{}
Source code in may/population/population.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def generate_population(self, **kwargs):
    """
    Materialise Person objects from the loaded precise demographics.

    For every unit of the first level in ``self.geography.levels`` that has
    demographic data, exactly ``count`` people are created per (age, sex)
    cell. Cells are processed in ascending (age, sex) order across all
    units, so the first person created is among the youngest.

    The Person ID counter is reset before generation. The method logs and
    returns without creating anyone when no demographics are loaded or the
    geography has no units at that level.

    Args:
      **kwargs:
        Arbitrary keyword arguments forwarded unchanged to every Person.
        Supported keys include:
          * activities (list[str], optional): list of activity names for each Person.
          * properties (dict, optional): a dict of properties of the Person, e.g. 'ethnicity', 'compliance', 'taste'.
          * activity_map (DefaultDict[str,list[Subset]], optional):
            a dict mapping an activity (same string as in activities) to a list of potential Subsets the Person would
            join to fulfil that activity.

    """
    if not self.precise_demographics:
        logger.error("No demographics data loaded. Cannot generate population.")
        return

    logger.info("Generating population from precise demographics...")
    Person.reset_counter()

    # Smallest geographical level = first entry of the hierarchy
    smallest_level = self.geography.levels[0]
    units_by_name = self.geography.get_units_by_level(smallest_level)
    if not units_by_name:
        logger.warning(f"No {smallest_level} units found in geography. Cannot generate population.")
        return

    # Flatten the nested store into (age, sex, unit, count) cells
    cells = []
    for unit in units_by_name.values():
        if unit.name not in self.precise_demographics:
            logger.debug(f"No demographic data for {unit.name}, skipping")
            continue

        for age, sex_counts in self.precise_demographics[unit.name].items():
            cells.extend((age, sex, unit, count) for sex, count in sex_counts.items())

    # Order by age, then sex; the sort is stable, so unit order is kept
    # within one (age, sex) pair.
    cells.sort(key=lambda cell: cell[:2])

    geo_units_with_data = len({cell[2] for cell in cells})

    # Create the people cell by cell, i.e. in global age order
    total_people = 0
    for age, sex, unit, count in cells:
        for _ in range(count):
            newcomer = Person(age=age, sex=sex, geographical_unit=unit, **kwargs)
            self.add_person(newcomer)
            # Also register the person with their geographical unit
            unit.add_person(newcomer)
            total_people += 1

    logger.info(f"Generated {total_people:,} people across {geo_units_with_data} {smallest_level}s")
    if geo_units_with_data > 0:
        logger.info(f"Average: {total_people / geo_units_with_data:.1f} people per {smallest_level}")

get_all_people()

Get all people as a list.

Returns:

Name Type Description
list

List of all Person objects

Source code in may/population/population.py
361
362
363
364
365
366
367
368
def get_all_people(self):
    """
    Return every registered person.

    Returns:
        list: The manager's own ``people`` list (not a copy)
    """
    everyone = self.people
    return everyone

get_people_by_activity(activity)

Get all people with a specific activity.

Parameters:

Name Type Description Default
activity str

Activity name

required

Returns:

Name Type Description
list

List of Person objects with this activity

Source code in may/population/population.py
395
396
397
398
399
400
401
402
403
404
405
def get_people_by_activity(self, activity):
    """
    Select the people for whom ``has_activity(activity)`` is truthy.

    Args:
        activity (str): Activity name

    Returns:
        list: Matching Person objects, in registration order
    """
    matches = []
    for person in self.people:
        if person.has_activity(activity):
            matches.append(person)
    return matches

get_people_by_age_range(min_age, max_age)

Get all people within an age range.

Parameters:

Name Type Description Default
min_age int

Minimum age (inclusive)

required
max_age int

Maximum age (inclusive)

required

Returns:

Name Type Description
list

List of Person objects in age range

Source code in may/population/population.py
370
371
372
373
374
375
376
377
378
379
380
381
def get_people_by_age_range(self, min_age, max_age):
    """
    Select the people whose age lies in a closed interval.

    Args:
        min_age (int): Lower bound (inclusive)
        max_age (int): Upper bound (inclusive)

    Returns:
        list: Matching Person objects, in registration order
    """
    matches = []
    for person in self.people:
        if min_age <= person.age <= max_age:
            matches.append(person)
    return matches

get_people_by_geo_unit(geo_unit_code)

Get all people in a specific geographical unit.

Parameters:

Name Type Description Default
geo_unit_code str

Name/code of the geographical unit

required

Returns:

Name Type Description
list

List of Person objects in this geo_unit

Source code in may/population/population.py
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def get_people_by_geo_unit(self, geo_unit_code):
    """
    Return the people attached to one geographical unit.

    Args:
        geo_unit_code (str): Name/code of the geographical unit

    Returns:
        list: The unit's ``people`` attribute, or an empty list when the
        unit is unknown or has no ``people`` attribute
    """
    unit = self.geography.get_unit(geo_unit_code)
    if unit is not None:
        return getattr(unit, 'people', [])
    return []

get_people_by_sex(sex)

Get all people of a specific sex.

Parameters:

Name Type Description Default
sex str

Sex category

required

Returns:

Name Type Description
list

List of Person objects

Source code in may/population/population.py
383
384
385
386
387
388
389
390
391
392
393
def get_people_by_sex(self, sex):
    """
    Select the people whose ``sex`` equals the given category.

    Args:
        sex (str): Sex category to match exactly

    Returns:
        list: Matching Person objects, in registration order
    """
    matches = []
    for person in self.people:
        if person.sex == sex:
            matches.append(person)
    return matches

get_person(person_id)

Get a person by their ID.

Parameters:

Name Type Description Default
person_id int

ID of the person

required

Returns:

Name Type Description
Person

The person object, or None if not found

Source code in may/population/population.py
349
350
351
352
353
354
355
356
357
358
359
def get_person(self, person_id):
    """
    Look up a registered person in the ID index.

    Args:
        person_id (int): ID of the person

    Returns:
        Person: The matching person, or None when the ID is not registered
    """
    index = self.people_by_id
    return index.get(person_id, None)

get_statistics()

Get basic statistics about the population.

Returns:

Name Type Description
dict

Dictionary of statistics

Source code in may/population/population.py
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
def get_statistics(self):
    """
    Summarise the registered population.

    Returns:
        dict: Empty when nobody is registered; otherwise the keys
        'total_population', 'mean_age', 'median_age', 'min_age', 'max_age',
        'sex_distribution' (sex -> head-count) and 'activity_counts'
        (activity -> number of people returned by
        ``get_people_by_activity``).
    """
    population = self.people
    if not population:
        return {}

    ages = [person.age for person in population]

    # Head-count per sex category
    sex_counts = {}
    for person in population:
        category = person.sex
        sex_counts[category] = sex_counts.get(category, 0) + 1

    # Every activity mentioned by anyone
    all_activities = set()
    for person in population:
        all_activities.update(person.activities)

    activity_counts = {
        activity: len(self.get_people_by_activity(activity))
        for activity in all_activities
    }

    return dict(
        total_population=len(population),
        mean_age=np.mean(ages),
        median_age=np.median(ages),
        min_age=np.min(ages),
        max_age=np.max(ages),
        sex_distribution=sex_counts,
        activity_counts=activity_counts,
    )

load_batch_explicit_from_csv(data_dir, column_mapping)

Load individual-level population data from multiple MGU-level CSV files.

Source code in may/population/population.py
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
def load_batch_explicit_from_csv(self, data_dir: str, column_mapping: Dict[str, str]):
    """
    Load individual-level population data from multiple MGU-level CSV files.

    For every MGU in the loaded geography this looks for
    ``<data_dir>/<mgu_name>_pop.csv``. Files that do not exist are skipped.
    Each file that exists is read, narrowed to rows whose geographical
    column names a unit present in the geography (when such a column is
    found), and handed to ``load_explicit_from_df``.

    Args:
        data_dir (str): Directory containing the per-MGU ``*_pop.csv`` files
        column_mapping (Dict[str, str]): Passed through unchanged to
            ``load_explicit_from_df``
    """
    mgu_units = self.geography.get_units_by_level("MGU")

    # Process files in sorted name order. Iterating a set of names gives an
    # order that can differ between interpreter runs (string hashing is
    # randomized), which would make the load order - and presumably the
    # person IDs handed out after the counter reset - non-reproducible.
    mgu_names = sorted(mgu_units.keys())

    # Columns that may identify a row's geographical unit, and the set of
    # units we keep. Both are the same for every file, so build them once.
    geo_cols = {'SGU', 'MGU', 'geo_unit'}.union(self.geography.levels)
    loaded_units = set(self.geography.get_all_units().keys())

    # Reset ID counter once for the whole batch
    Person.reset_counter()

    logger.info(f"Starting batch explicit population load for {len(mgu_names)} MGUs")

    total_files = 0
    for mgu_name in mgu_names:
        path = os.path.join(data_dir, f"{mgu_name}_pop.csv")
        if not os.path.exists(path):
            logger.debug(f"No population file for MGU {mgu_name}: {path}")
            continue

        df = pd.read_csv(path)
        total_files += 1

        # Pre-filter on the first geographical column present in the file so
        # that rows for units outside the loaded geography are dropped here.
        # NOTE(review): this picks by column order and does not consult
        # column_mapping['geo_unit'], unlike load_explicit_from_df - confirm
        # that is intended.
        actual_geo_col = next((col for col in df.columns if col in geo_cols), None)
        if actual_geo_col is not None:
            df = df[df[actual_geo_col].isin(loaded_units)]

        self.load_explicit_from_df(df, column_mapping)

    logger.info(f"Batch load complete. Processed {total_files} files.")

load_demographics_from_csv(male_file='demographics_male.csv', female_file='demographics_female.csv')

Load precise population demographics from matrix-style CSV files.

Expected format (separate files for male/female):

    geo_unit,0,1,2,3,...,100
    E00004320,2,2,1,3,...,0
    E00004321,1,3,2,2,...,1
    ...

Rows = geo units; columns = ages (1-year bins from 0 to 100).

Parameters:

Name Type Description Default
male_file str

Filename for male demographics

'demographics_male.csv'
female_file str

Filename for female demographics

'demographics_female.csv'
Source code in may/population/population.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def load_demographics_from_csv(self, male_file="demographics_male.csv",
                                 female_file="demographics_female.csv"):
    """
    Populate ``self.precise_demographics`` from two wide-format CSV files.

    Each file (one for males, one for females) is laid out as:
        geo_unit,0,1,2,3,...,100
        E00004320,2,2,1,3,...,0
        E00004321,1,3,2,2,...,1
        ...

    One row per geo unit, one column per age (1-year bins from 0 to 100).
    Only rows for units at the geography's first (smallest) level are kept,
    and the result is stored as geo_unit -> age -> sex -> count.

    If either file is missing, or the geography has no units at that level,
    a message is logged and nothing is loaded.

    Args:
        male_file (str): Filename for male demographics
        female_file (str): Filename for female demographics

    Raises:
        ValueError: If either file has no 'geo_unit' column
    """
    male_path = os.path.join(self.data_dir, male_file)
    female_path = os.path.join(self.data_dir, female_file)

    if not (os.path.exists(male_path) and os.path.exists(female_path)):
        logger.error(f"Demographics files not found: {male_path} or {female_path}")
        logger.info("Cannot generate population without demographics data")
        return

    # Demographics are restricted to units of the first level in the geography
    smallest_level = self.geography.levels[0]
    units_at_level = self.geography.get_units_by_level(smallest_level)
    if not units_at_level:
        logger.warning(f"No {smallest_level} units found in geography. Cannot load demographics.")
        return

    # Set membership keeps the row filter below cheap
    valid_geo_units = set(units_at_level.keys())
    logger.info(f"Filtering demographics to {len(valid_geo_units)} {smallest_level}s in loaded geography")

    logger.info(f"Loading male demographics from {male_path}")
    male_df = pd.read_csv(male_path)
    logger.info(f"Loading female demographics from {female_path}")
    female_df = pd.read_csv(female_path)

    # Both files must identify their rows by geo unit
    if not all('geo_unit' in frame.columns for frame in (male_df, female_df)):
        raise ValueError("Demographics files must have 'geo_unit' column")

    # A stray 'index' column would otherwise be treated as an age column
    for frame in (male_df, female_df):
        if 'index' in frame.columns:
            frame.drop(columns=['index'], inplace=True)

    # Discard units outside the loaded geography before any reshaping
    male_df = male_df[male_df['geo_unit'].isin(valid_geo_units)]
    female_df = female_df[female_df['geo_unit'].isin(valid_geo_units)]
    logger.info(f"Filtered to {len(male_df)} male geo units and {len(female_df)} female geo units")

    def _wide_to_long(wide):
        # One row per (geo_unit, age) with a positive integer count
        long_form = wide.melt(id_vars=['geo_unit'], var_name='age', value_name='count')
        long_form['age'] = long_form['age'].astype(int)
        long_form['count'] = long_form['count'].fillna(0).astype(int)
        return long_form[long_form['count'] > 0]

    # A bound method (not a lambda) is used as the factory for pickle compatibility
    self.precise_demographics = defaultdict(self._create_nested_defaultdict)
    total_people = 0

    logger.info("Processing male demographics...")
    male_long = _wide_to_long(male_df)

    logger.info("Processing female demographics...")
    female_long = _wide_to_long(female_df)

    logger.info("Building demographic dictionary...")
    # Iterate the raw arrays: each row is [geo_unit, age, count]
    for sex_label, long_form in (('male', male_long), ('female', female_long)):
        for unit_name, age_bin, head_count in long_form.values:
            head_count = int(head_count)
            self.precise_demographics[str(unit_name)][int(age_bin)][sex_label] = head_count
            total_people += head_count

    logger.info(f"Loaded precise demographics for {len(self.precise_demographics)} geographical units")
    logger.info(f"Total people in demographics: {total_people:,}")

load_explicit_from_csv(filename, column_mapping)

Load individual-level population data from a CSV file.

Source code in may/population/population.py
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
def load_explicit_from_csv(self, filename: str, column_mapping: Dict[str, str]):
    """
    Read one CSV of individual people from ``self.data_dir`` and load it.

    When the file does not exist an error is logged and nothing is loaded.
    Otherwise the file is read, the Person ID counter is reset, and the
    rows are handed to ``load_explicit_from_df`` together with
    ``column_mapping``.
    """
    path = os.path.join(self.data_dir, filename)

    if os.path.exists(path):
        logger.info(f"Loading explicit population from {path}")
        frame = pd.read_csv(path)

        # This is the entry point for a single-file load, so IDs restart here
        Person.reset_counter()

        self.load_explicit_from_df(frame, column_mapping)
    else:
        logger.error(f"Explicit population file not found: {path}")

load_explicit_from_df(df, column_mapping)

Internal method to load population from a DataFrame.

Source code in may/population/population.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
def load_explicit_from_df(self, df: pd.DataFrame, column_mapping: Dict[str, str]):
    """
    Internal method to load population from a DataFrame.

    Every row becomes one ``Person`` that is registered through
    ``self.add_person`` and attached to its geographical unit. Rows whose
    geographical unit cannot be resolved are skipped with a warning.

    Args:
        df (pd.DataFrame): One row per person.
        column_mapping (Dict[str, str]): Maps a target attribute name to the
            column in ``df`` that holds it. The targets 'age', 'sex' and
            'geo_unit' are interpreted specially; any other target is stored
            in the person's ``properties`` under the target name. Columns
            that are neither mapped nor the geographical column are stored
            in ``properties`` under their own column name.

    Raises:
        ValueError: If no geographical column can be identified in ``df``.
    """
    target_to_csv = column_mapping

    # Identify the geographical column. The column mapped to 'geo_unit' wins
    # when it exists in the data; otherwise the first column (in DataFrame
    # column order) named 'geo_unit', 'SGU', 'MGU' or after a geography level.
    geo_cols = {'geo_unit', 'SGU', 'MGU'}.union(self.geography.levels)

    mapped_geo_col = target_to_csv.get('geo_unit')
    if mapped_geo_col in df.columns:
        actual_geo_col = mapped_geo_col
    else:
        actual_geo_col = next((col for col in df.columns if col in geo_cols), None)

    if actual_geo_col is None:
        raise ValueError("Missing required geographical column (e.g., 'geo_unit', 'SGU', 'MGU') in population data")

    # Columns that must not be copied into `properties` as-is: anything
    # consumed through the mapping, plus the geographical column, which
    # drives `geographical_unit` and must not be duplicated as a property
    # (parallel with VenueManager, which excludes its geo column from the
    # property dict). Identical for every row, so built once.
    reserved_cols = set(target_to_csv.values()) | {actual_geo_col}

    column_names = list(df.columns)
    people_count = 0

    # Plain tuples zipped with the real column names are used instead of
    # namedtuple rows: itertuples() renames columns that are not valid
    # Python identifiers (spaces, leading digits or underscores, ...) to
    # positional names, so `_asdict()` would lose those column names and
    # the lookups below would silently miss them.
    for row in df.itertuples(index=False, name=None):
        row_dict = dict(zip(column_names, row))
        properties = {}
        age = 0
        sex = "unknown"

        # 1. Determine geographical unit
        # NOTE(review): this lookup uses the raw cell value, whereas the
        # mapped 'geo_unit' branch below looks up str(val).strip() - confirm
        # whether the raw value should be normalized here as well.
        geo_unit_name = row_dict.get(actual_geo_col)
        geo_unit = self.geography.get_unit(geo_unit_name) if geo_unit_name else None

        if not geo_unit:
            logger.warning(f"No geographical unit found for person in row (col: {actual_geo_col}, val: {geo_unit_name}). Skipping.")
            continue

        # 2. Extract known attributes
        for target, csv_col in target_to_csv.items():
            if csv_col not in row_dict:
                continue

            val = row_dict[csv_col]
            if target == 'age':
                try:
                    age = int(float(val))
                except (ValueError, TypeError, OverflowError):
                    # Missing, non-numeric or infinite ages fall back to 0
                    age = 0
            elif target == 'sex':
                sex = str(val).lower().strip() if pd.notna(val) else "unknown"
                # Normalize common sex strings
                if sex in ['m', '1', 'male']:
                    sex = 'male'
                elif sex in ['f', '2', 'female']:
                    sex = 'female'
            elif target == 'geo_unit':
                unit_name = str(val).strip()
                found_unit = self.geography.get_unit(unit_name)
                if found_unit:
                    geo_unit = found_unit
            else:
                # Treat as a generic property
                properties[target] = val

        # 3. Carry over every remaining column as a property
        for col, val in row_dict.items():
            if col not in reserved_cols:
                properties[col] = val

        # 4. Create and add person. geo_unit is always set at this point:
        # rows without a resolvable unit were skipped above.
        person = Person(age=age, sex=sex, geographical_unit=geo_unit, properties=properties)
        self.add_person(person)
        geo_unit.add_person(person)

        people_count += 1

    logger.info(f"Successfully loaded {people_count:,} people from explicit data.")