FIA Database

The FIA class is the main entry point for working with FIA data.

Overview

import pyfia

# Basic usage
db = pyfia.FIA("georgia.duckdb")
db.clip_by_state("GA")

# Use estimation functions
result = pyfia.volume(db, grp_by="SPCD")

Spatial Filtering

PyFIA supports spatial filtering using polygon boundaries:

with pyfia.FIA("southeast.duckdb") as db:
    db.clip_by_state(37)  # North Carolina
    db.clip_most_recent(eval_type="VOL")

    # Filter to custom region
    db.clip_by_polygon("my_region.geojson")

    # Join polygon attributes for grouping
    db.intersect_polygons("counties.shp", attributes=["NAME"])

    # Group estimates by polygon attribute
    result = pyfia.tpa(db, grp_by=["NAME"])

See the Spatial Filtering Guide for detailed usage.

Class Reference

FIA

FIA(db_path: Union[str, Path], engine: Optional[str] = None)

Main FIA database class for working with Forest Inventory and Analysis data.

This class provides methods for loading FIA data from DuckDB databases, filtering by EVALID, and preparing data for estimation functions.

Attributes
  • db_path (Path): Path to the DuckDB database.
  • tables (Dict[str, LazyFrame]): Loaded FIA tables as lazy frames.
  • evalid (list of int or None): Active EVALID filter.
  • most_recent (bool): Whether to use most recent evaluations.

Initialize FIA database connection.

Parameters
  • db_path (str or Path): Path to FIA database (DuckDB or SQLite).
  • engine (str, default None): Database engine ('duckdb', 'sqlite', or None for auto-detect).
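
A brief construction sketch; the file names below are placeholders, and the engine argument can usually be omitted because the backend is auto-detected from the file:

import pyfia

# Typical usage: engine auto-detected from the file extension/contents
db = pyfia.FIA("georgia.duckdb")

# Explicit engine selection (hypothetical SQLite database file)
db_sqlite = pyfia.FIA("georgia_fia.db", engine="sqlite")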

Source code in src/pyfia/core/fia.py
def __init__(self, db_path: Union[str, Path], engine: Optional[str] = None):
    """
    Initialize FIA database connection.

    Parameters
    ----------
    db_path : str or Path
        Path to FIA database (DuckDB or SQLite).
    engine : str, optional
        Database engine ('duckdb', 'sqlite', or None for auto-detect).
    """
    self.db_path = Path(db_path)
    if not self.db_path.exists():
        raise FileNotFoundError(f"Database not found: {db_path}")

    # Initialize with appropriate engine
    self.tables: Dict[str, pl.LazyFrame] = {}
    self.evalid: Optional[List[int]] = None
    self.most_recent: bool = False
    self.state_filter: Optional[List[int]] = None  # Add state filter
    self._valid_plot_cns: Optional[List[str]] = (
        None  # Cache for EVALID plot filtering
    )
    # Spatial filter attributes
    self._spatial_plot_cns: Optional[List[str]] = (
        None  # Cache for spatial filtering
    )
    self._polygon_path: Optional[str] = None  # Path to polygon file
    # Polygon attribute join (for grp_by support)
    self._polygon_attributes: Optional[pl.DataFrame] = (
        None  # CN → polygon attributes mapping
    )
    # Connection managed by FIADataReader
    self._reader = FIADataReader(db_path, engine=engine)

load_table

load_table(table_name: str, columns: Optional[List[str]] = None, where: Optional[str] = None) -> LazyFrame

Load a table from the FIA database as a lazy frame.

Parameters
  • table_name (str): Name of the FIA table to load (e.g., 'PLOT', 'TREE', 'COND').
  • columns (list of str, default None): Columns to load. Loads all columns if None.
  • where (str, default None): Additional SQL WHERE clause to apply (without 'WHERE' keyword). Used for database-side filtering to reduce memory usage.

Returns
  • LazyFrame: Polars LazyFrame of the requested table.
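
A usage sketch; the database path and column selection are illustrative, and STATUSCD = 1 selects live trees in standard FIA coding:

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_by_state(13)  # Georgia

# Load only the TREE columns needed, filtering live trees on the database side
trees = db.load_table(
    "TREE",
    columns=["CN", "PLT_CN", "SPCD", "DIA", "STATUSCD"],
    where="STATUSCD = 1",
)

# The return value is a Polars LazyFrame; collect() materializes it
live_trees = trees.collect()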

Source code in src/pyfia/core/fia.py
def load_table(
    self,
    table_name: str,
    columns: Optional[List[str]] = None,
    where: Optional[str] = None,
) -> pl.LazyFrame:
    """
    Load a table from the FIA database as a lazy frame.

    Parameters
    ----------
    table_name : str
        Name of the FIA table to load (e.g., 'PLOT', 'TREE', 'COND').
    columns : list of str, optional
        Columns to load. Loads all columns if None.
    where : str, optional
        Additional SQL WHERE clause to apply (without 'WHERE' keyword).
        Used for database-side filtering to reduce memory usage.

    Returns
    -------
    pl.LazyFrame
        Polars LazyFrame of the requested table.
    """
    # Build base WHERE clause for state filter
    base_where_clause = None
    if self.state_filter and table_name in ["PLOT", "COND", "TREE"]:
        state_list = ", ".join(str(s) for s in self.state_filter)
        base_where_clause = f"STATECD IN ({state_list})"

    # Add user-provided WHERE clause
    if where:
        if base_where_clause:
            base_where_clause = f"{base_where_clause} AND ({where})"
        else:
            base_where_clause = where

    # EVALID filter via PLT_CN for TREE, COND tables
    # This is a critical optimization - it reduces data load by 90%+ for GRM estimates
    if self.evalid and table_name in ["TREE", "COND"]:
        valid_plot_cns = self._get_valid_plot_cns()
        if valid_plot_cns:
            # Batch the PLT_CN values to avoid SQL query limits
            batch_size = 900
            dfs = []

            for i in range(0, len(valid_plot_cns), batch_size):
                batch = valid_plot_cns[i : i + batch_size]
                cn_str = ", ".join(f"'{cn}'" for cn in batch)
                plt_cn_where = f"PLT_CN IN ({cn_str})"

                # Combine with base where clause if present
                if base_where_clause:
                    where_clause = f"{base_where_clause} AND {plt_cn_where}"
                else:
                    where_clause = plt_cn_where

                df = self._reader.read_table(
                    table_name,
                    columns=columns,
                    where=where_clause,
                    lazy=True,
                )
                dfs.append(df)

            # Concatenate all batches
            if len(dfs) == 1:
                result = dfs[0]
            else:
                result = pl.concat(dfs)

            # Join polygon attributes for PLOT table if available
            if table_name == "PLOT" and self._polygon_attributes is not None:
                result = result.join(
                    self._polygon_attributes.lazy(),
                    on="CN",
                    how="left",
                )

            # Apply spatial filter if set (clip_by_polygon)
            if self._spatial_plot_cns is not None:
                if table_name == "PLOT":
                    result = result.filter(
                        pl.col("CN").is_in(self._spatial_plot_cns)
                    )
                elif table_name in ["TREE", "COND"]:
                    result = result.filter(
                        pl.col("PLT_CN").is_in(self._spatial_plot_cns)
                    )

            self.tables[table_name] = result
            return self.tables[table_name]

    # Default path - no EVALID filtering or not a filterable table
    df = self._reader.read_table(
        table_name,
        columns=columns,
        where=base_where_clause,
        lazy=True,
    )

    # Join polygon attributes for PLOT table if available
    if table_name == "PLOT" and self._polygon_attributes is not None:
        df = df.join(
            self._polygon_attributes.lazy(),
            on="CN",
            how="left",
        )

    # Apply spatial filter if set (clip_by_polygon)
    if self._spatial_plot_cns is not None:
        if table_name == "PLOT":
            df = df.filter(pl.col("CN").is_in(self._spatial_plot_cns))
        elif table_name in ["TREE", "COND"]:
            df = df.filter(pl.col("PLT_CN").is_in(self._spatial_plot_cns))

    self.tables[table_name] = df
    return self.tables[table_name]

find_evalid

find_evalid(most_recent: bool = True, state: Optional[Union[int, List[int]]] = None, year: Optional[Union[int, List[int]]] = None, eval_type: Optional[str] = None) -> List[int]

Find EVALID values matching criteria.

Identify evaluation IDs for filtering FIA data based on specific criteria.

Parameters
  • most_recent (bool, default True): If True, return only most recent evaluations per state.
  • state (int or list of int, default None): State FIPS code(s) to filter by.
  • year (int or list of int, default None): End inventory year(s) to filter by.
  • eval_type (str, default None): Evaluation type ('VOL', 'GRM', 'CHNG', 'ALL', etc.).

Returns
  • list of int: EVALID values matching the specified criteria.
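
For example (path and codes are illustrative), you can inspect which evaluations are available before applying a filter:

import pyfia

db = pyfia.FIA("southeast.duckdb")

# Most recent volume evaluations for North Carolina (FIPS 37)
evalids = db.find_evalid(most_recent=True, state=37, eval_type="VOL")

# Apply them explicitly; a single EVALID or the full list is accepted
db.clip_by_evalid(evalids)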

Source code in src/pyfia/core/fia.py
def find_evalid(
    self,
    most_recent: bool = True,
    state: Optional[Union[int, List[int]]] = None,
    year: Optional[Union[int, List[int]]] = None,
    eval_type: Optional[str] = None,
) -> List[int]:
    """
    Find EVALID values matching criteria.

    Identify evaluation IDs for filtering FIA data based on specific criteria.

    Parameters
    ----------
    most_recent : bool, default True
        If True, return only most recent evaluations per state.
    state : int or list of int, optional
        State FIPS code(s) to filter by.
    year : int or list of int, optional
        End inventory year(s) to filter by.
    eval_type : str, optional
        Evaluation type ('VOL', 'GRM', 'CHNG', 'ALL', etc.).

    Returns
    -------
    list of int
        EVALID values matching the specified criteria.
    """
    # Load required tables if not already loaded
    try:
        if "POP_EVAL" not in self.tables:
            self.load_table("POP_EVAL")
        if "POP_EVAL_TYP" not in self.tables:
            self.load_table("POP_EVAL_TYP")

        # Get the data
        pop_eval = self.tables["POP_EVAL"].collect()
        pop_eval_typ = self.tables["POP_EVAL_TYP"].collect()

        # Check if EVALID exists in POP_EVAL
        if "EVALID" not in pop_eval.columns:
            raise ValueError(
                f"EVALID column not found in POP_EVAL table. Available columns: {pop_eval.columns}"
            )

        # Join on CN = EVAL_CN
        df = pop_eval.join(
            pop_eval_typ, left_on="CN", right_on="EVAL_CN", how="left"
        )
    except Exception as e:
        # Log the error and raise a more specific exception
        logger.debug(f"Failed to load evaluation tables: {e}")
        raise DatabaseError(
            f"Could not load evaluation tables (POP_EVAL, POP_EVAL_TYP): {e}. "
            "Ensure the database contains FIA population tables."
        )

    # Apply filters
    if state is not None:
        if isinstance(state, int):
            state = [state]
        df = df.filter(pl.col("STATECD").is_in(state))

    if year is not None:
        if isinstance(year, int):
            year = [year]
        df = df.filter(pl.col("END_INVYR").is_in(year))

    if eval_type is not None:
        # FIA uses 'EXP' prefix for evaluation types
        # Special case: "ALL" maps to "EXPALL" for area estimation
        if eval_type.upper() == "ALL":
            eval_type_full = "EXPALL"
        else:
            eval_type_full = f"EXP{eval_type}"
        df = df.filter(pl.col("EVAL_TYP") == eval_type_full)

    if most_recent:
        # Add parsed EVALID columns for robust year sorting
        df = _add_parsed_evalid_columns(df)  # type: ignore[assignment]

        # Special handling for Texas (STATECD=48)
        # Texas has separate East/West evaluations, but we want the full state
        # Prefer evaluations with "Texas" (not "Texas(EAST)" or "Texas(West)")
        df_texas = df.filter(pl.col("STATECD") == 48)
        df_other = df.filter(pl.col("STATECD") != 48)

        if not df_texas.is_empty():
            # For Texas, prefer full state evaluations over regional ones
            # Check LOCATION_NM to identify full state vs regional
            if "LOCATION_NM" in df_texas.columns:
                # Mark full state evaluations (just "Texas" without parentheses)
                df_texas = df_texas.with_columns(
                    pl.when(pl.col("LOCATION_NM") == "Texas")
                    .then(1)
                    .otherwise(0)
                    .alias("IS_FULL_STATE")
                )
                # Sort using parsed year for robust chronological ordering
                df_texas = (
                    df_texas.sort(
                        ["EVAL_TYP", "IS_FULL_STATE", "EVALID_YEAR", "EVALID_TYPE"],
                        descending=[False, True, True, False],
                    )
                    .group_by(["STATECD", "EVAL_TYP"])
                    .first()
                    .drop(
                        [
                            "IS_FULL_STATE",
                            "EVALID_YEAR",
                            "EVALID_STATE",
                            "EVALID_TYPE",
                        ]
                    )
                )
            else:
                # Fallback if LOCATION_NM not available - use parsed year
                df_texas = (
                    df_texas.sort(
                        ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
                        descending=[False, False, True, False],
                    )
                    .group_by(["STATECD", "EVAL_TYP"])
                    .first()
                    .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
                )

        # For other states, use robust year sorting
        if not df_other.is_empty():
            df_other = (
                df_other.sort(
                    ["STATECD", "EVAL_TYP", "EVALID_YEAR", "EVALID_TYPE"],
                    descending=[False, False, True, False],
                )
                .group_by(["STATECD", "EVAL_TYP"])
                .first()
                .drop(["EVALID_YEAR", "EVALID_STATE", "EVALID_TYPE"])
            )

        # Combine Texas and other states
        df_list = []
        if not df_texas.is_empty():
            df_list.append(df_texas)
        if not df_other.is_empty():
            df_list.append(df_other)

        if df_list:
            df = pl.concat(df_list)

    # Extract unique EVALIDs
    evalids = df.select("EVALID").unique().sort("EVALID")["EVALID"].to_list()

    return evalids

clip_by_evalid

clip_by_evalid(evalid: Union[int, List[int]]) -> FIA

Filter FIA data by EVALID (evaluation ID).

This is the core filtering method that ensures statistically valid plot groupings by evaluation.

Parameters
  • evalid (int or list of int): Single EVALID or list of EVALIDs to filter by.

Returns
  • FIA: Self for method chaining.
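
A short sketch; the EVALID shown is a placeholder, so use find_evalid() to discover the values present in your database:

import pyfia

db = pyfia.FIA("georgia.duckdb")

# Filter to one evaluation; chaining works because the method returns self
result = pyfia.volume(db.clip_by_evalid(132301), grp_by="SPCD")

Changing the EVALID clears any previously loaded tables so that subsequent loads honor the new filter.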

Source code in src/pyfia/core/fia.py
def clip_by_evalid(self, evalid: Union[int, List[int]]) -> "FIA":
    """
    Filter FIA data by EVALID (evaluation ID).

    This is the core filtering method that ensures statistically valid
    plot groupings by evaluation.

    Parameters
    ----------
    evalid : int or list of int
        Single EVALID or list of EVALIDs to filter by.

    Returns
    -------
    FIA
        Self for method chaining.
    """
    if isinstance(evalid, int):
        evalid = [evalid]

    self.evalid = evalid
    # Clear plot CN cache when EVALID changes
    self._valid_plot_cns = None
    # Clear loaded tables to ensure they use the new filter
    self.tables.clear()
    return self

clip_by_state

clip_by_state(state: Union[int, List[int]], most_recent: bool = True, eval_type: Optional[str] = 'ALL') -> FIA

Filter FIA data by state code(s).

This method efficiently filters data at the database level by:

  1. Setting a state filter for direct table queries
  2. Finding appropriate EVALIDs for the state(s)
  3. Combining both filters for optimal performance

Parameters
  • state (int or list of int): Single state FIPS code or list of codes.
  • most_recent (bool, default True): If True, use only most recent evaluations.
  • eval_type (str, default 'ALL'): Evaluation type to use. Default 'ALL' for EXPALL, which is appropriate for area estimation. Use None to get all types.

Returns
  • FIA: Self for method chaining.
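
A usage sketch (the path and FIPS code are illustrative):

import pyfia

db = pyfia.FIA("southeast.duckdb")

# Most recent EXPALL evaluation for North Carolina (FIPS 37);
# chain clip_most_recent() to switch to a volume evaluation for tree estimates
db.clip_by_state(37).clip_most_recent(eval_type="VOL")
result = pyfia.volume(db, grp_by="SPCD")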

Source code in src/pyfia/core/fia.py
def clip_by_state(
    self,
    state: Union[int, List[int]],
    most_recent: bool = True,
    eval_type: Optional[str] = "ALL",
) -> "FIA":
    """
    Filter FIA data by state code(s).

    This method efficiently filters data at the database level by:
    1. Setting a state filter for direct table queries
    2. Finding appropriate EVALIDs for the state(s)
    3. Combining both filters for optimal performance

    Parameters
    ----------
    state : int or list of int
        Single state FIPS code or list of codes.
    most_recent : bool, default True
        If True, use only most recent evaluations.
    eval_type : str, optional, default 'ALL'
        Evaluation type to use. Default 'ALL' for EXPALL which is
        appropriate for area estimation. Use None to get all types.

    Returns
    -------
    FIA
        Self for method chaining.
    """
    if isinstance(state, int):
        state = [state]

    self.state_filter = state

    # Find EVALIDs for proper statistical grouping
    if eval_type is not None:
        # Get specific evaluation type (e.g., "ALL" for EXPALL)
        evalids = self.find_evalid(
            state=state, most_recent=most_recent, eval_type=eval_type
        )
        if evalids:
            # Use only the first EVALID to ensure single evaluation
            self.clip_by_evalid([evalids[0]] if len(evalids) > 1 else evalids)
    else:
        # Get all evaluation types (old behavior - can cause overcounting)
        evalids = self.find_evalid(state=state, most_recent=most_recent)
        if evalids:
            self.clip_by_evalid(evalids)

    return self

clip_most_recent

clip_most_recent(eval_type: str = 'VOL') -> FIA

Filter to most recent evaluation of specified type.

Parameters
  • eval_type (str, default 'VOL'): Evaluation type ('VOL' for volume, 'GRM' for growth, etc.).

Returns
  • FIA: Self for method chaining.
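
A minimal sketch, assuming the database contains GRM evaluations for the selected state (the path is a placeholder):

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_by_state(13)                  # Georgia; also narrows the EVALID search
db.clip_most_recent(eval_type="GRM")  # latest growth/removal/mortality evaluation

If no evaluations of the requested type exist, NoEVALIDError is raised.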

Source code in src/pyfia/core/fia.py
def clip_most_recent(self, eval_type: str = "VOL") -> "FIA":
    """
    Filter to most recent evaluation of specified type.

    Parameters
    ----------
    eval_type : str, default 'VOL'
        Evaluation type ('VOL' for volume, 'GRM' for growth, etc.).

    Returns
    -------
    FIA
        Self for method chaining.
    """
    self.most_recent = True
    # Include state filter if it exists
    state_filter = getattr(self, "state_filter", None)
    evalids = self.find_evalid(
        most_recent=True,
        eval_type=eval_type,
        state=state_filter,  # Pass state filter to find_evalid
    )

    if not evalids:
        raise NoEVALIDError(
            operation=f"clip_most_recent(eval_type='{eval_type}')",
            suggestion=f"No evaluations found for type '{eval_type}'. "
            "Check that the database contains evaluation data for this type, "
            "or use find_evalid() to see available evaluations.",
        )

    # When most_recent is True, we get one EVALID per state
    # This is correct - we want the most recent evaluation for EACH state
    return self.clip_by_evalid(evalids)

clip_by_polygon

clip_by_polygon(polygon: Union[str, Path], predicate: str = 'intersects') -> FIA

Filter FIA plots to those within or intersecting a polygon boundary.

Uses DuckDB spatial extension for efficient point-in-polygon filtering. Supports Shapefiles, GeoJSON, GeoPackage, and GeoParquet formats.

Parameters
  • polygon (str or Path): Path to spatial file containing polygon(s). Supported formats:
      - Shapefile (.shp)
      - GeoJSON (.geojson, .json)
      - GeoPackage (.gpkg)
      - GeoParquet (.parquet with geometry)
      - Any format supported by GDAL/OGR
  • predicate (str, default 'intersects'): Spatial predicate for filtering:
      - 'intersects': Plots that intersect the polygon (recommended)
      - 'within': Plots completely within the polygon

Returns
  • FIA: Self for method chaining.

Raises
  • SpatialFileError: If the polygon file cannot be read or does not exist.
  • SpatialExtensionError: If the DuckDB spatial extension cannot be loaded.
  • NoSpatialFilterError: If no plots fall within the polygon boundary.

Notes
  • FIA plot coordinates (LAT/LON) are in EPSG:4326 (WGS84)
  • The polygon file should use EPSG:4326 or will be transformed automatically
  • Public FIA coordinates are fuzzed up to 1 mile for privacy, so precision below ~1 mile is not meaningful
  • This filter combines with state and EVALID filters

Examples:

>>> with FIA("southeast.duckdb") as db:
...     db.clip_by_state(37)  # North Carolina
...     db.clip_by_polygon("my_region.geojson")
...     result = db.tpa()
>>> # Using a shapefile
>>> with FIA("data.duckdb") as db:
...     db.clip_by_polygon("counties.shp")
...     result = db.area()
Source code in src/pyfia/core/fia.py
def clip_by_polygon(
    self,
    polygon: Union[str, Path],
    predicate: str = "intersects",
) -> "FIA":
    """
    Filter FIA plots to those within or intersecting a polygon boundary.

    Uses DuckDB spatial extension for efficient point-in-polygon filtering.
    Supports Shapefiles, GeoJSON, GeoPackage, and GeoParquet formats.

    Parameters
    ----------
    polygon : str or Path
        Path to spatial file containing polygon(s). Supported formats:
        - Shapefile (.shp)
        - GeoJSON (.geojson, .json)
        - GeoPackage (.gpkg)
        - GeoParquet (.parquet with geometry)
        - Any format supported by GDAL/OGR
    predicate : str, default 'intersects'
        Spatial predicate for filtering:
        - 'intersects': Plots that intersect the polygon (recommended)
        - 'within': Plots completely within the polygon

    Returns
    -------
    FIA
        Self for method chaining.

    Raises
    ------
    SpatialFileError
        If the polygon file cannot be read or does not exist.
    SpatialExtensionError
        If the DuckDB spatial extension cannot be loaded.
    NoSpatialFilterError
        If no plots fall within the polygon boundary.

    Notes
    -----
    - FIA plot coordinates (LAT/LON) are in EPSG:4326 (WGS84)
    - The polygon file should use EPSG:4326 or will be transformed automatically
    - Public FIA coordinates are fuzzed up to 1 mile for privacy, so precision
      below ~1 mile is not meaningful
    - This filter combines with state and EVALID filters

    Examples
    --------
    >>> with FIA("southeast.duckdb") as db:
    ...     db.clip_by_state(37)  # North Carolina
    ...     db.clip_by_polygon("my_region.geojson")
    ...     result = db.tpa()

    >>> # Using a shapefile
    >>> with FIA("data.duckdb") as db:
    ...     db.clip_by_polygon("counties.shp")
    ...     result = db.area()
    """
    from .backends.duckdb_backend import DuckDBBackend

    # Validate inputs
    polygon_path = Path(polygon)
    if not polygon_path.exists():
        raise SpatialFileError(
            str(polygon),
            reason="File not found",
            supported_formats=[".shp", ".geojson", ".json", ".gpkg", ".parquet"],
        )

    # Ensure we're using DuckDB backend (spatial requires DuckDB)
    if not isinstance(self._reader._backend, DuckDBBackend):
        raise SpatialExtensionError(
            "Spatial operations require DuckDB backend. "
            "SQLite does not support spatial queries."
        )

    # Store polygon path for reference
    self._polygon_path = str(polygon_path)

    # Build the spatial query
    # Note: FIA stores coordinates as LAT, LON but we need to create POINT(LON, LAT)
    # because ST_Point expects (x, y) = (longitude, latitude)
    predicate_fn = "ST_Intersects" if predicate == "intersects" else "ST_Within"

    # Query to find plot CNs within the polygon
    # Using ST_Read to load the polygon file and ST_Point to create points from LAT/LON
    query = f"""
        WITH boundary AS (
            SELECT ST_Union_Agg(geom) as geom
            FROM ST_Read('{polygon_path}')
        )
        SELECT CAST(p.CN AS VARCHAR) as CN
        FROM PLOT p, boundary b
        WHERE {predicate_fn}(
            ST_Point(p.LON, p.LAT),
            b.geom
        )
    """

    # Add state filter if present
    if self.state_filter:
        state_list = ", ".join(str(s) for s in self.state_filter)
        query = f"""
            WITH boundary AS (
                SELECT ST_Union_Agg(geom) as geom
                FROM ST_Read('{polygon_path}')
            )
            SELECT CAST(p.CN AS VARCHAR) as CN
            FROM PLOT p, boundary b
            WHERE p.STATECD IN ({state_list})
            AND {predicate_fn}(
                ST_Point(p.LON, p.LAT),
                b.geom
            )
        """

    try:
        # Execute spatial query
        result = self._reader._backend.execute_spatial_query(query)  # type: ignore[attr-defined]

        if result.is_empty():
            raise NoSpatialFilterError(str(polygon_path))

        # Store filtered plot CNs
        self._spatial_plot_cns = result["CN"].to_list()
        logger.info(
            f"Spatial filter applied: {len(self._spatial_plot_cns)} plots "
            f"within polygon from '{polygon_path}'"
        )

    except NoSpatialFilterError:
        raise
    except SpatialExtensionError:
        raise
    except Exception as e:
        error_msg = str(e)
        if "ST_Read" in error_msg or "GDAL" in error_msg:
            raise SpatialFileError(
                str(polygon_path),
                reason=f"Could not read spatial data: {error_msg}",
                supported_formats=[
                    ".shp",
                    ".geojson",
                    ".json",
                    ".gpkg",
                    ".parquet",
                ],
            )
        raise

    # Clear loaded tables to ensure they use the new filter
    self.tables.clear()
    return self

intersect_polygons

intersect_polygons(polygon: Union[str, Path], attributes: List[str]) -> FIA

Perform spatial join between plots and polygons, adding polygon attributes to plots for use in grp_by.

This method joins polygon attributes to FIA plots based on spatial intersection. The resulting attributes can be used as grouping variables in estimator functions.

Parameters
  • polygon (str or Path): Path to spatial file containing polygon(s). Supported formats:
      - Shapefile (.shp)
      - GeoJSON (.geojson, .json)
      - GeoPackage (.gpkg)
      - GeoParquet (.parquet with geometry)
      - Any format supported by GDAL/OGR
  • attributes (list of str): Polygon attribute columns to add to plots. These columns must exist in the polygon file and will be available for grp_by.

Returns
  • FIA: Self for method chaining.

Raises
  • SpatialFileError: If the polygon file cannot be read or does not exist.
  • SpatialExtensionError: If the DuckDB spatial extension cannot be loaded.
  • ValueError: If requested attributes don't exist in the polygon file.

Notes
  • Plots that don't intersect any polygon will have NULL values
  • If a plot intersects multiple polygons, the first match is used
  • Attributes are available immediately for grp_by in estimators
  • This method is independent of clip_by_polygon (can use both)

Examples:

>>> with FIA("southeast.duckdb") as db:
...     db.clip_by_state(37)  # North Carolina
...     db.intersect_polygons("counties.shp", attributes=["NAME", "FIPS"])
...     # Group TPA estimates by county
...     result = tpa(db, grp_by=["NAME"])
>>> # Use multiple attributes for grouping
>>> with FIA("data.duckdb") as db:
...     db.intersect_polygons("regions.geojson", ["REGION", "DISTRICT"])
...     result = area(db, grp_by=["REGION", "DISTRICT"])
Source code in src/pyfia/core/fia.py
def intersect_polygons(
    self,
    polygon: Union[str, Path],
    attributes: List[str],
) -> "FIA":
    """
    Perform spatial join between plots and polygons, adding polygon
    attributes to plots for use in grp_by.

    This method joins polygon attributes to FIA plots based on spatial
    intersection. The resulting attributes can be used as grouping
    variables in estimator functions.

    Parameters
    ----------
    polygon : str or Path
        Path to spatial file containing polygon(s). Supported formats:
        - Shapefile (.shp)
        - GeoJSON (.geojson, .json)
        - GeoPackage (.gpkg)
        - GeoParquet (.parquet with geometry)
        - Any format supported by GDAL/OGR
    attributes : list of str
        Polygon attribute columns to add to plots. These columns must
        exist in the polygon file and will be available for grp_by.

    Returns
    -------
    FIA
        Self for method chaining.

    Raises
    ------
    SpatialFileError
        If the polygon file cannot be read or does not exist.
    SpatialExtensionError
        If the DuckDB spatial extension cannot be loaded.
    ValueError
        If requested attributes don't exist in the polygon file.

    Notes
    -----
    - Plots that don't intersect any polygon will have NULL values
    - If a plot intersects multiple polygons, the first match is used
    - Attributes are available immediately for grp_by in estimators
    - This method is independent of clip_by_polygon (can use both)

    Examples
    --------
    >>> with FIA("southeast.duckdb") as db:
    ...     db.clip_by_state(37)  # North Carolina
    ...     db.intersect_polygons("counties.shp", attributes=["NAME", "FIPS"])
    ...     # Group TPA estimates by county
    ...     result = tpa(db, grp_by=["NAME"])

    >>> # Use multiple attributes for grouping
    >>> with FIA("data.duckdb") as db:
    ...     db.intersect_polygons("regions.geojson", ["REGION", "DISTRICT"])
    ...     result = area(db, grp_by=["REGION", "DISTRICT"])
    """
    from .backends.duckdb_backend import DuckDBBackend

    # Validate inputs
    polygon_path = Path(polygon)
    if not polygon_path.exists():
        raise SpatialFileError(
            str(polygon),
            reason="File not found",
            supported_formats=[".shp", ".geojson", ".json", ".gpkg", ".parquet"],
        )

    if not attributes:
        raise ValueError(
            "attributes parameter must contain at least one column name"
        )

    # Ensure we're using DuckDB backend (spatial requires DuckDB)
    if not isinstance(self._reader._backend, DuckDBBackend):
        raise SpatialExtensionError(
            "Spatial operations require DuckDB backend. "
            "SQLite does not support spatial queries."
        )

    # First, check what columns exist in the polygon file
    try:
        check_query = f"SELECT * FROM ST_Read('{polygon_path}') LIMIT 0"
        schema_result = self._reader._backend.execute_spatial_query(check_query)
        available_cols = schema_result.columns
    except Exception as e:
        raise SpatialFileError(
            str(polygon_path),
            reason=f"Could not read spatial data: {e}",
            supported_formats=[".shp", ".geojson", ".json", ".gpkg", ".parquet"],
        )

    # Validate requested attributes exist
    missing_attrs = [attr for attr in attributes if attr not in available_cols]
    if missing_attrs:
        raise ValueError(
            f"Attributes not found in polygon file: {missing_attrs}. "
            f"Available columns: {[c for c in available_cols if c != 'geom']}"
        )

    # Build attribute selection for SQL
    attr_select = ", ".join(f"poly.{attr}" for attr in attributes)

    # Build the spatial join query
    # Use DISTINCT ON equivalent to get first match for each plot
    query = f"""
        WITH ranked AS (
            SELECT
                CAST(p.CN AS VARCHAR) as CN,
                {attr_select},
                ROW_NUMBER() OVER (PARTITION BY p.CN ORDER BY p.CN) as rn
            FROM PLOT p
            JOIN ST_Read('{polygon_path}') poly
            ON ST_Intersects(ST_Point(p.LON, p.LAT), poly.geom)
    """

    # Add state filter if present
    if self.state_filter:
        state_list = ", ".join(str(s) for s in self.state_filter)
        query += f"        WHERE p.STATECD IN ({state_list})\n"

    query += (
        """
        )
        SELECT CN, """
        + ", ".join(attributes)
        + """
        FROM ranked
        WHERE rn = 1
    """
    )

    try:
        # Execute spatial join query
        result = self._reader._backend.execute_spatial_query(query)

        # Store polygon attributes
        self._polygon_attributes = result
        n_matched = len(result)

        logger.info(
            f"Intersect polygons: {n_matched} plots matched with attributes "
            f"{attributes} from '{polygon_path}'"
        )

        if n_matched == 0:
            warnings.warn(
                f"No plots intersected with polygons from '{polygon_path}'. "
                "All polygon attribute values will be NULL.",
                UserWarning,
            )

    except SpatialExtensionError:
        raise
    except Exception as e:
        error_msg = str(e)
        if "ST_Read" in error_msg or "GDAL" in error_msg:
            raise SpatialFileError(
                str(polygon_path),
                reason=f"Could not read spatial data: {error_msg}",
                supported_formats=[
                    ".shp",
                    ".geojson",
                    ".json",
                    ".gpkg",
                    ".parquet",
                ],
            )
        raise

    # Clear loaded tables to ensure they include polygon attributes
    self.tables.clear()
    return self

get_plots

get_plots(columns: Optional[List[str]] = None) -> DataFrame

Get PLOT table filtered by current EVALID, state, and spatial settings.

Parameters
  • columns (list of str, default None): Columns to return. Returns all columns if None.

Returns
  • DataFrame: Filtered PLOT dataframe.
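
A sketch of pulling the filtered plots into memory; the path and column selection are illustrative (CN, STATECD, INVYR, LAT, and LON are standard PLOT fields):

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_by_state(13).clip_most_recent(eval_type="VOL")

plots = db.get_plots(columns=["CN", "STATECD", "INVYR", "LAT", "LON"])
print(plots.height, "plots in the selected evaluation")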

Source code in src/pyfia/core/fia.py
def get_plots(self, columns: Optional[List[str]] = None) -> pl.DataFrame:
    """
    Get PLOT table filtered by current EVALID, state, and spatial settings.

    Parameters
    ----------
    columns : list of str, optional
        Columns to return. Returns all columns if None.

    Returns
    -------
    pl.DataFrame
        Filtered PLOT dataframe.
    """
    # Load PLOT table if needed (with state filter applied)
    if "PLOT" not in self.tables:
        self.load_table("PLOT")

    # If we have EVALID filter, we need to join with assignments
    if self.evalid:
        # Load assignment table with EVALID filter directly
        evalid_str = ", ".join(str(e) for e in self.evalid)
        ppsa = self._reader.read_table(
            "POP_PLOT_STRATUM_ASSGN",
            columns=["PLT_CN", "STRATUM_CN", "EVALID"],
            where=f"EVALID IN ({evalid_str})",
            lazy=True,
        )

        # Filter plots to those in the evaluation
        plots = self.tables["PLOT"].join(
            ppsa.select(["PLT_CN", "EVALID"]).unique(),
            left_on="CN",
            right_on="PLT_CN",
            how="inner",
        )
    else:
        plots = self.tables["PLOT"]

    # Apply spatial filter if set
    if self._spatial_plot_cns is not None:
        plots = plots.filter(pl.col("CN").is_in(self._spatial_plot_cns))

    # Select columns if specified
    if columns:
        plots = plots.select(columns)

    # Materialize results
    plots_df = plots.collect()

    # Ensure PLT_CN is always available for downstream joins
    if "PLT_CN" not in plots_df.columns and "CN" in plots_df.columns:
        plots_df = plots_df.with_columns(pl.col("CN").alias("PLT_CN"))

    return plots_df

get_trees

get_trees(columns: Optional[List[str]] = None) -> DataFrame

Get TREE table filtered by current EVALID and state settings.

Parameters
  • columns (list of str, default None): Columns to return. Returns all columns if None.

Returns
  • DataFrame: Filtered TREE dataframe.
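
A brief sketch (illustrative path; SPCD, DIA, and TPA_UNADJ are standard TREE fields). The plots are loaded first so the plot-level EVALID join has data to work with:

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_most_recent(eval_type="VOL")

plots = db.get_plots(columns=["CN", "INVYR"])
trees = db.get_trees(columns=["PLT_CN", "SPCD", "DIA", "TPA_UNADJ"])
print(trees["SPCD"].n_unique(), "species present")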

Source code in src/pyfia/core/fia.py
def get_trees(self, columns: Optional[List[str]] = None) -> pl.DataFrame:
    """
    Get TREE table filtered by current EVALID and state settings.

    Parameters
    ----------
    columns : list of str, optional
        Columns to return. Returns all columns if None.

    Returns
    -------
    pl.DataFrame
        Filtered TREE dataframe.
    """
    # Load TREE table if needed (with state filter applied)
    if "TREE" not in self.tables:
        self.load_table("TREE")

    # If we need additional filtering by plot CNs
    if self.evalid:
        # Get plot CNs efficiently
        plot_query = self.tables["PLOT"].select("CN")
        if self.evalid:
            evalid_str = ", ".join(str(e) for e in self.evalid)
            ppsa = self._reader.read_table(
                "POP_PLOT_STRATUM_ASSGN",
                columns=["PLT_CN"],
                where=f"EVALID IN ({evalid_str})",
                lazy=True,
            ).unique()
            plot_query = plot_query.join(
                ppsa, left_on="CN", right_on="PLT_CN", how="inner"
            )

        # Filter trees to those plots
        trees = self.tables["TREE"].join(
            plot_query.select("CN"), left_on="PLT_CN", right_on="CN", how="inner"
        )
    else:
        trees = self.tables["TREE"]

    # Select columns if specified
    if columns:
        trees = trees.select(columns)

    return trees.collect()

get_conditions

get_conditions(columns: Optional[List[str]] = None) -> DataFrame

Get COND table filtered by current EVALID and state settings.

Parameters
  • columns (list of str, default None): Columns to return. Returns all columns if None.

Returns
  • DataFrame: Filtered COND dataframe.
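
A short sketch along the same lines as get_trees() (illustrative path; COND_STATUS_CD and FORTYPCD are standard COND fields):

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_most_recent(eval_type="ALL")
db.get_plots()  # load the filtered plots first so conditions can be matched to them

conds = db.get_conditions(columns=["PLT_CN", "CONDID", "COND_STATUS_CD", "FORTYPCD"])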

Source code in src/pyfia/core/fia.py
def get_conditions(self, columns: Optional[List[str]] = None) -> pl.DataFrame:
    """
    Get COND table filtered by current EVALID and state settings.

    Parameters
    ----------
    columns : list of str, optional
        Columns to return. Returns all columns if None.

    Returns
    -------
    pl.DataFrame
        Filtered COND dataframe.
    """
    # Load COND table if needed (with state filter applied)
    if "COND" not in self.tables:
        self.load_table("COND")

    # If we need additional filtering by plot CNs
    if self.evalid:
        # Get plot CNs efficiently
        plot_query = self.tables["PLOT"].select("CN")
        if self.evalid:
            evalid_str = ", ".join(str(e) for e in self.evalid)
            ppsa = self._reader.read_table(
                "POP_PLOT_STRATUM_ASSGN",
                columns=["PLT_CN"],
                where=f"EVALID IN ({evalid_str})",
                lazy=True,
            ).unique()
            plot_query = plot_query.join(
                ppsa, left_on="CN", right_on="PLT_CN", how="inner"
            )

        # Filter conditions to those plots
        conds = self.tables["COND"].join(
            plot_query.select("CN"), left_on="PLT_CN", right_on="CN", how="inner"
        )
    else:
        conds = self.tables["COND"]

    # Select columns if specified
    if columns:
        conds = conds.select(columns)

    return conds.collect()

prepare_estimation_data

prepare_estimation_data(include_trees: bool = True) -> Dict[str, DataFrame]

Prepare standard set of tables for estimation functions.

This method loads and filters the core tables needed for most FIA estimation procedures, properly filtered by EVALID.

Parameters
  • include_trees (bool, default True): Whether to include the TREE table. Set to False for area estimation, which doesn't need tree data (saves significant memory in constrained environments).

Returns
  • Dict[str, DataFrame]: Dictionary with filtered dataframes for estimation containing 'plot', 'tree', 'cond', 'pop_plot_stratum_assgn', 'pop_stratum', and 'pop_estn_unit'. If include_trees=False, 'tree' will be an empty DataFrame.
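
A usage sketch (the path is a placeholder); area-only workflows can skip tree data:

import pyfia

db = pyfia.FIA("georgia.duckdb")
db.clip_most_recent(eval_type="VOL")

data = db.prepare_estimation_data()
print(data["plot"].height, data["tree"].height, data["cond"].height)

# Area estimation does not need TREE records
area_data = db.prepare_estimation_data(include_trees=False)
assert area_data["tree"].is_empty()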

Source code in src/pyfia/core/fia.py
def prepare_estimation_data(
    self, include_trees: bool = True
) -> Dict[str, pl.DataFrame]:
    """
    Prepare standard set of tables for estimation functions.

    This method loads and filters the core tables needed for most
    FIA estimation procedures, properly filtered by EVALID.

    Parameters
    ----------
    include_trees : bool, default True
        Whether to include the TREE table. Set to False for area
        estimation which doesn't need tree data (saves significant
        memory on constrained environments).

    Returns
    -------
    Dict[str, pl.DataFrame]
        Dictionary with filtered dataframes for estimation containing:
        'plot', 'tree', 'cond', 'pop_plot_stratum_assgn',
        'pop_stratum', 'pop_estn_unit'.
        If include_trees=False, 'tree' will be an empty DataFrame.
    """
    # Ensure we have an EVALID filter
    if not self.evalid and not self.most_recent:
        warnings.warn("No EVALID filter set. Using most recent volume evaluation.")
        self.clip_most_recent(eval_type="VOL")

    # Load population tables
    if "POP_STRATUM" not in self.tables:
        self.load_table("POP_STRATUM")
    if "POP_PLOT_STRATUM_ASSGN" not in self.tables:
        self.load_table("POP_PLOT_STRATUM_ASSGN")
    if "POP_ESTN_UNIT" not in self.tables:
        self.load_table("POP_ESTN_UNIT")

    # Get filtered core tables
    plots = self.get_plots()
    # Only load TREE table if needed (volume, biomass, TPA, mortality, etc.)
    # Area estimation doesn't need tree data - skip to save memory
    trees = self.get_trees() if include_trees else pl.DataFrame()
    conds = self.get_conditions()

    # Get stratum assignments for filtered plots
    plot_cns = plots["CN"].to_list()
    if self.evalid is None:
        raise NoEVALIDError(
            operation="get_estimation_data",
            suggestion="Use clip_by_evalid() or clip_most_recent() before calling estimation functions.",
        )
    ppsa = (
        self.tables["POP_PLOT_STRATUM_ASSGN"]
        .filter(pl.col("PLT_CN").is_in(plot_cns))
        .filter(pl.col("EVALID").is_in(self.evalid))
        .collect()
    )

    # Get strata for these assignments
    stratum_cns = ppsa["STRATUM_CN"].unique().to_list()
    pop_stratum = (
        self.tables["POP_STRATUM"].filter(pl.col("CN").is_in(stratum_cns)).collect()
    )

    # Get estimation units
    estn_unit_cns = pop_stratum["ESTN_UNIT_CN"].unique().to_list()
    pop_estn_unit = (
        self.tables["POP_ESTN_UNIT"]
        .filter(pl.col("CN").is_in(estn_unit_cns))
        .collect()
    )

    return {
        "plot": plots,
        "tree": trees,
        "cond": conds,
        "pop_plot_stratum_assgn": ppsa,
        "pop_stratum": pop_stratum,
        "pop_estn_unit": pop_estn_unit,
    }