diff --git a/examples/flood_simulation.py b/examples/flood_simulation.py index 91556564..5309134b 100644 --- a/examples/flood_simulation.py +++ b/examples/flood_simulation.py @@ -40,10 +40,10 @@ from matplotlib.animation import FuncAnimation from xrspatial import generate_terrain, slope -from xrspatial.flow_direction import flow_direction -from xrspatial.flow_accumulation import flow_accumulation -from xrspatial.flow_length import flow_length -from xrspatial.hand import hand +from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction +from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation +from xrspatial.hydro.flow_length_d8 import flow_length_d8 as flow_length +from xrspatial.hydro.hand_d8 import hand_d8 as hand from xrspatial.flood import ( curve_number_runoff, flood_depth, diff --git a/examples/landslide_risk.py b/examples/landslide_risk.py index 634a1c70..a7440e6a 100644 --- a/examples/landslide_risk.py +++ b/examples/landslide_risk.py @@ -43,9 +43,9 @@ from xrspatial import generate_terrain, slope from xrspatial.curvature import curvature -from xrspatial.flow_direction import flow_direction -from xrspatial.flow_accumulation import flow_accumulation -from xrspatial.twi import twi +from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction +from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation +from xrspatial.hydro.twi_d8 import twi_d8 as twi from xrspatial.terrain_metrics import tpi # -- Tunable parameters ----------------------------------------------------- diff --git a/examples/watershed_explorer.py b/examples/watershed_explorer.py index 31e5ceed..789c2e1f 100644 --- a/examples/watershed_explorer.py +++ b/examples/watershed_explorer.py @@ -34,12 +34,12 @@ import matplotlib.colors as mcolors from xrspatial import generate_terrain -from xrspatial.flow_direction import flow_direction -from xrspatial.flow_accumulation import flow_accumulation 
-from xrspatial.stream_order import stream_order -from xrspatial.snap_pour_point import snap_pour_point -from xrspatial.watershed import watershed -from xrspatial.basin import basin +from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction +from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation +from xrspatial.hydro.stream_order_d8 import stream_order_d8 as stream_order +from xrspatial.hydro.snap_pour_point_d8 import snap_pour_point_d8 as snap_pour_point +from xrspatial.hydro.watershed_d8 import watershed_d8 as watershed +from xrspatial.hydro.basin_d8 import basin_d8 as basin # -- Tunable parameters ----------------------------------------------------- CELL_SIZE = 30.0 # metres per pixel diff --git a/xrspatial/__init__.py b/xrspatial/__init__.py index 8de0a0dc..3eb227e9 100644 --- a/xrspatial/__init__.py +++ b/xrspatial/__init__.py @@ -23,7 +23,8 @@ from xrspatial.curvature import curvature # noqa from xrspatial.emerging_hotspots import emerging_hotspots # noqa from xrspatial.erosion import erode # noqa -from xrspatial.fill import fill # noqa +from xrspatial.hydro import fill # noqa: unified wrapper +from xrspatial.hydro import fill_d8 # noqa from xrspatial.interpolate import idw # noqa from xrspatial.interpolate import kriging # noqa from xrspatial.interpolate import spline # noqa @@ -38,23 +39,22 @@ from xrspatial.flood import flood_depth # noqa from xrspatial.flood import inundation # noqa from xrspatial.flood import travel_time # noqa -from xrspatial.flow_accumulation import flow_accumulation # noqa -from xrspatial.flow_accumulation_dinf import flow_accumulation_dinf # noqa -from xrspatial.flow_accumulation_mfd import flow_accumulation_mfd # noqa -from xrspatial.flow_direction import flow_direction # noqa -from xrspatial.flow_direction_dinf import flow_direction_dinf # noqa -from xrspatial.flow_direction_mfd import flow_direction_mfd # noqa -from xrspatial.flow_length import flow_length # noqa -from 
xrspatial.flow_length_dinf import flow_length_dinf # noqa -from xrspatial.flow_length_mfd import flow_length_mfd # noqa -from xrspatial.flow_path import flow_path # noqa +from xrspatial.hydro import flow_accumulation # noqa: unified wrapper +from xrspatial.hydro import flow_accumulation_d8, flow_accumulation_dinf, flow_accumulation_mfd # noqa +from xrspatial.hydro import flow_direction # noqa: unified wrapper +from xrspatial.hydro import flow_direction_d8, flow_direction_dinf, flow_direction_mfd # noqa +from xrspatial.hydro import flow_length # noqa: unified wrapper +from xrspatial.hydro import flow_length_d8, flow_length_dinf, flow_length_mfd # noqa +from xrspatial.hydro import flow_path # noqa: unified wrapper +from xrspatial.hydro import flow_path_d8, flow_path_dinf, flow_path_mfd # noqa from xrspatial.focal import mean # noqa from xrspatial.glcm import glcm_texture # noqa from xrspatial.morphology import morph_closing # noqa from xrspatial.morphology import morph_dilate # noqa from xrspatial.morphology import morph_erode # noqa from xrspatial.morphology import morph_opening # noqa -from xrspatial.hand import hand # noqa +from xrspatial.hydro import hand # noqa: unified wrapper +from xrspatial.hydro import hand_d8, hand_dinf, hand_mfd # noqa from xrspatial.hillshade import hillshade # noqa from xrspatial.mahalanobis import mahalanobis # noqa from xrspatial.multispectral import arvi # noqa @@ -76,14 +76,14 @@ from xrspatial.proximity import great_circle_distance # noqa from xrspatial.proximity import manhattan_distance # noqa from xrspatial.proximity import proximity # noqa -from xrspatial.sink import sink # noqa -from xrspatial.snap_pour_point import snap_pour_point # noqa -from xrspatial.stream_link import stream_link # noqa -from xrspatial.stream_link_dinf import stream_link_dinf # noqa -from xrspatial.stream_link_mfd import stream_link_mfd # noqa -from xrspatial.stream_order import stream_order # noqa -from xrspatial.stream_order_dinf import stream_order_dinf 
# noqa -from xrspatial.stream_order_mfd import stream_order_mfd # noqa +from xrspatial.hydro import sink # noqa: unified wrapper +from xrspatial.hydro import sink_d8 # noqa +from xrspatial.hydro import snap_pour_point # noqa: unified wrapper +from xrspatial.hydro import snap_pour_point_d8 # noqa +from xrspatial.hydro import stream_link # noqa: unified wrapper +from xrspatial.hydro import stream_link_d8, stream_link_dinf, stream_link_mfd # noqa +from xrspatial.hydro import stream_order # noqa: unified wrapper +from xrspatial.hydro import stream_order_d8, stream_order_dinf, stream_order_mfd # noqa from xrspatial.sky_view_factor import sky_view_factor # noqa from xrspatial.slope import slope # noqa from xrspatial.surface_distance import surface_allocation # noqa @@ -95,12 +95,16 @@ from xrspatial.terrain_metrics import roughness # noqa from xrspatial.terrain_metrics import tpi # noqa from xrspatial.terrain_metrics import tri # noqa -from xrspatial.twi import twi # noqa +from xrspatial.hydro import twi # noqa: unified wrapper +from xrspatial.hydro import twi_d8 # noqa from xrspatial.polygonize import polygonize # noqa from xrspatial.viewshed import viewshed # noqa -from xrspatial.basin import basin # noqa -from xrspatial.watershed import basins # noqa -from xrspatial.watershed import watershed # noqa +from xrspatial.hydro import basin # noqa: unified wrapper +from xrspatial.hydro import basin_d8 # noqa +from xrspatial.hydro import basins # noqa: backward-compat alias +from xrspatial.hydro import basins_d8 # noqa +from xrspatial.hydro import watershed # noqa: unified wrapper +from xrspatial.hydro import watershed_d8, watershed_dinf, watershed_mfd # noqa from xrspatial.zonal import apply as zonal_apply # noqa from xrspatial.zonal import crop # noqa from xrspatial.zonal import trim # noqa diff --git a/xrspatial/hydro/__init__.py b/xrspatial/hydro/__init__.py new file mode 100644 index 00000000..1cf71cb1 --- /dev/null +++ b/xrspatial/hydro/__init__.py @@ -0,0 +1,149 @@ 
+"""Hydrology analysis modules for xarray-spatial. + +Includes flow direction, flow accumulation, flow length, flow path, +watershed delineation, basin labeling, HAND, stream ordering, and +related utilities for D8, D-infinity, and MFD routing. + +Each function family provides a unified wrapper that accepts a +``routing`` parameter ('d8', 'dinf', or 'mfd') and dispatches to +the corresponding implementation. The suffixed variants +(e.g. ``flow_direction_d8``) are also importable directly. +""" + +# -- concrete D8 implementations ------------------------------------------ +from xrspatial.hydro.basin_d8 import basin_d8 # noqa +from xrspatial.hydro.fill_d8 import fill_d8 # noqa +from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 # noqa +from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 # noqa +from xrspatial.hydro.flow_length_d8 import flow_length_d8 # noqa +from xrspatial.hydro.flow_path_d8 import flow_path_d8 # noqa +from xrspatial.hydro.hand_d8 import hand_d8 # noqa +from xrspatial.hydro.sink_d8 import sink_d8 # noqa +from xrspatial.hydro.snap_pour_point_d8 import snap_pour_point_d8 # noqa +from xrspatial.hydro.stream_link_d8 import stream_link_d8 # noqa +from xrspatial.hydro.stream_order_d8 import stream_order_d8 # noqa +from xrspatial.hydro.twi_d8 import twi_d8 # noqa +from xrspatial.hydro.watershed_d8 import basins_d8 # noqa +from xrspatial.hydro.watershed_d8 import watershed_d8 # noqa + +# -- concrete D-infinity implementations ----------------------------------- +from xrspatial.hydro.flow_accumulation_dinf import flow_accumulation_dinf # noqa +from xrspatial.hydro.flow_direction_dinf import flow_direction_dinf # noqa +from xrspatial.hydro.flow_length_dinf import flow_length_dinf # noqa +from xrspatial.hydro.flow_path_dinf import flow_path_dinf # noqa +from xrspatial.hydro.hand_dinf import hand_dinf # noqa +from xrspatial.hydro.stream_link_dinf import stream_link_dinf # noqa +from xrspatial.hydro.stream_order_dinf import 
stream_order_dinf # noqa +from xrspatial.hydro.watershed_dinf import watershed_dinf # noqa + +# -- concrete MFD implementations ----------------------------------------- +from xrspatial.hydro.flow_accumulation_mfd import flow_accumulation_mfd # noqa +from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd # noqa +from xrspatial.hydro.flow_length_mfd import flow_length_mfd # noqa +from xrspatial.hydro.flow_path_mfd import flow_path_mfd # noqa +from xrspatial.hydro.hand_mfd import hand_mfd # noqa +from xrspatial.hydro.stream_link_mfd import stream_link_mfd # noqa +from xrspatial.hydro.stream_order_mfd import stream_order_mfd # noqa +from xrspatial.hydro.watershed_mfd import watershed_mfd # noqa + + +# ========================================================================= +# Routing dispatch +# ========================================================================= + +class _RoutingDispatch: + """Map routing algorithm names to concrete implementations. + + Inspired by ArrayTypeFunctionMapping but keyed on a string + (``'d8'``, ``'dinf'``, ``'mfd'``) rather than array type. 
+ """ + + __slots__ = ('_name', '_impls') + + def __init__(self, name, **impls): + self._name = name + self._impls = impls + + def __call__(self, *args, routing='d8', **kwargs): + try: + fn = self._impls[routing] + except KeyError: + opts = ', '.join(repr(k) for k in self._impls) + raise ValueError( + f"Unknown routing {routing!r} for {self._name}; " + f"expected one of {opts}" + ) from None + return fn(*args, **kwargs) + + +# -- 8 families with d8 / dinf / mfd variants ---------------------------- + +flow_direction = _RoutingDispatch( + 'flow_direction', + d8=flow_direction_d8, dinf=flow_direction_dinf, mfd=flow_direction_mfd, +) + +flow_accumulation = _RoutingDispatch( + 'flow_accumulation', + d8=flow_accumulation_d8, dinf=flow_accumulation_dinf, + mfd=flow_accumulation_mfd, +) + +flow_length = _RoutingDispatch( + 'flow_length', + d8=flow_length_d8, dinf=flow_length_dinf, mfd=flow_length_mfd, +) + +flow_path = _RoutingDispatch( + 'flow_path', + d8=flow_path_d8, dinf=flow_path_dinf, mfd=flow_path_mfd, +) + +watershed = _RoutingDispatch( + 'watershed', + d8=watershed_d8, dinf=watershed_dinf, mfd=watershed_mfd, +) + +hand = _RoutingDispatch( + 'hand', + d8=hand_d8, dinf=hand_dinf, mfd=hand_mfd, +) + +stream_link = _RoutingDispatch( + 'stream_link', + d8=stream_link_d8, dinf=stream_link_dinf, mfd=stream_link_mfd, +) + + +# stream_order needs special handling: the ordering param (strahler/shreve) +# is called `ordering` in d8 and `method` in dinf/mfd. 
+ +class _StreamOrderDispatch(_RoutingDispatch): + def __call__(self, *args, routing='d8', ordering='strahler', **kwargs): + try: + fn = self._impls[routing] + except KeyError: + opts = ', '.join(repr(k) for k in self._impls) + raise ValueError( + f"Unknown routing {routing!r} for {self._name}; " + f"expected one of {opts}" + ) from None + if routing == 'd8': + return fn(*args, ordering=ordering, **kwargs) + return fn(*args, method=ordering, **kwargs) + + +stream_order = _StreamOrderDispatch( + 'stream_order', + d8=stream_order_d8, dinf=stream_order_dinf, mfd=stream_order_mfd, +) + + +# -- 5 D8-only functions (future-proofed with routing param) -------------- + +basin = _RoutingDispatch('basin', d8=basin_d8) +basins = _RoutingDispatch('basins', d8=basins_d8) +sink = _RoutingDispatch('sink', d8=sink_d8) +snap_pour_point = _RoutingDispatch('snap_pour_point', d8=snap_pour_point_d8) +fill = _RoutingDispatch('fill', d8=fill_d8) +twi = _RoutingDispatch('twi', d8=twi_d8) diff --git a/xrspatial/_boundary_store.py b/xrspatial/hydro/_boundary_store.py similarity index 100% rename from xrspatial/_boundary_store.py rename to xrspatial/hydro/_boundary_store.py diff --git a/xrspatial/basin.py b/xrspatial/hydro/basin_d8.py similarity index 97% rename from xrspatial/basin.py rename to xrspatial/hydro/basin_d8.py index d7a404c4..11ed2fac 100644 --- a/xrspatial/basin.py +++ b/xrspatial/hydro/basin_d8.py @@ -272,7 +272,7 @@ def _basins_dask_iterative(flow_dir_da): Constructs basin pour_points lazily, then delegates to the watershed dask infrastructure. 
""" - from xrspatial.watershed import _watershed_dask_iterative + from xrspatial.hydro.watershed_d8 import _watershed_dask_iterative chunks_y = flow_dir_da.chunks[0] chunks_x = flow_dir_da.chunks[1] @@ -300,7 +300,7 @@ def _basins_make_pp_block(flow_dir_block, block_info=None): def _basins_dask_cupy(flow_dir_da): """Dask+CuPy basins: native GPU via watershed infrastructure.""" import cupy as cp - from xrspatial.watershed import _watershed_dask_cupy + from xrspatial.hydro.watershed_d8 import _watershed_dask_cupy chunks_y = flow_dir_da.chunks[0] chunks_x = flow_dir_da.chunks[1] @@ -332,7 +332,7 @@ def _basins_make_pp_block(flow_dir_block, block_info=None): # ===================================================================== @supports_dataset -def basin(flow_dir: xr.DataArray, +def basin_d8(flow_dir: xr.DataArray, name: str = 'basin') -> xr.DataArray: """Delineate drainage basins: every cell labeled with its outlet ID. @@ -360,7 +360,7 @@ def basin(flow_dir: xr.DataArray, data = flow_dir.data if isinstance(data, np.ndarray): - from xrspatial.watershed import _watershed_cpu + from xrspatial.hydro.watershed_d8 import _watershed_cpu fd = data.astype(np.float64) h, w = fd.shape labels = _basins_init_labels(fd, h, w, h, w, 0, 0) diff --git a/xrspatial/fill.py b/xrspatial/hydro/fill_d8.py similarity index 99% rename from xrspatial/fill.py rename to xrspatial/hydro/fill_d8.py index a899809f..ab992071 100644 --- a/xrspatial/fill.py +++ b/xrspatial/hydro/fill_d8.py @@ -36,7 +36,7 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset @@ -449,7 +449,7 @@ def _fill_dask_cupy(dem_data): # ===================================================================== @supports_dataset -def fill(dem: xr.DataArray, +def fill_d8(dem: xr.DataArray, z_limit=None, name: str = 'fill') -> xr.DataArray: """Fill depressions 
in a DEM using Planchon-Darboux iterative flooding. diff --git a/xrspatial/flow_accumulation.py b/xrspatial/hydro/flow_accumulation_d8.py similarity index 99% rename from xrspatial/flow_accumulation.py rename to xrspatial/hydro/flow_accumulation_d8.py index 0593fe49..4ab96868 100644 --- a/xrspatial/flow_accumulation.py +++ b/xrspatial/hydro/flow_accumulation_d8.py @@ -38,7 +38,7 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset @@ -881,7 +881,7 @@ def _flow_accum_dask_cupy(flow_dir_da): # ===================================================================== @supports_dataset -def flow_accumulation(flow_dir: xr.DataArray, +def flow_accumulation_d8(flow_dir: xr.DataArray, name: str = 'flow_accumulation') -> xr.DataArray: """Compute flow accumulation from a D8 flow direction grid. diff --git a/xrspatial/flow_accumulation_dinf.py b/xrspatial/hydro/flow_accumulation_dinf.py similarity index 99% rename from xrspatial/flow_accumulation_dinf.py rename to xrspatial/hydro/flow_accumulation_dinf.py index 9b45dcc1..2d164d84 100644 --- a/xrspatial/flow_accumulation_dinf.py +++ b/xrspatial/hydro/flow_accumulation_dinf.py @@ -37,9 +37,9 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset -from xrspatial.flow_accumulation import ( +from xrspatial.hydro.flow_accumulation_d8 import ( _find_ready_and_finalize, _preprocess_tiles, ) diff --git a/xrspatial/flow_accumulation_mfd.py b/xrspatial/hydro/flow_accumulation_mfd.py similarity index 99% rename from xrspatial/flow_accumulation_mfd.py rename to xrspatial/hydro/flow_accumulation_mfd.py index c2eceab2..2e483c02 100644 --- a/xrspatial/flow_accumulation_mfd.py +++ 
b/xrspatial/hydro/flow_accumulation_mfd.py @@ -37,7 +37,7 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset # Neighbor offsets: E, SE, S, SW, W, NW, N, NE diff --git a/xrspatial/flow_direction.py b/xrspatial/hydro/flow_direction_d8.py similarity index 99% rename from xrspatial/flow_direction.py rename to xrspatial/hydro/flow_direction_d8.py index 6495de00..5f57cb70 100644 --- a/xrspatial/flow_direction.py +++ b/xrspatial/hydro/flow_direction_d8.py @@ -266,7 +266,7 @@ def _run_dask_cupy(data: da.Array, # ===================================================================== @supports_dataset -def flow_direction(agg: xr.DataArray, +def flow_direction_d8(agg: xr.DataArray, name: str = 'flow_direction', boundary: str = 'nan') -> xr.DataArray: """Compute D8 flow direction for each cell. diff --git a/xrspatial/flow_direction_dinf.py b/xrspatial/hydro/flow_direction_dinf.py similarity index 100% rename from xrspatial/flow_direction_dinf.py rename to xrspatial/hydro/flow_direction_dinf.py diff --git a/xrspatial/flow_direction_mfd.py b/xrspatial/hydro/flow_direction_mfd.py similarity index 100% rename from xrspatial/flow_direction_mfd.py rename to xrspatial/hydro/flow_direction_mfd.py diff --git a/xrspatial/flow_length.py b/xrspatial/hydro/flow_length_d8.py similarity index 99% rename from xrspatial/flow_length.py rename to xrspatial/hydro/flow_length_d8.py index 5fd6d5ba..abb21ab3 100644 --- a/xrspatial/flow_length.py +++ b/xrspatial/hydro/flow_length_d8.py @@ -20,9 +20,9 @@ except ImportError: da = None -from xrspatial.flow_accumulation import _code_to_offset -from xrspatial.watershed import _code_to_offset_py -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro.flow_accumulation_d8 import _code_to_offset +from xrspatial.hydro.watershed_d8 import _code_to_offset_py +from 
xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.utils import ( _validate_raster, get_dataarray_resolution, @@ -687,7 +687,7 @@ def _compute_seeds_upstream(iy, ix, boundaries, flow_bdry, cellsize_x, cellsize_y, diag): """For upstream: check which adjacent cells flow INTO this tile, and seed with max(existing, neighbor_upstream + step_dist).""" - from xrspatial.flow_accumulation import _compute_seeds as _compute_accum_seeds + from xrspatial.hydro.flow_accumulation_d8 import _compute_seeds as _compute_accum_seeds tile_h = chunks_y[iy] tile_w = chunks_x[ix] @@ -1021,7 +1021,7 @@ def _flow_length_dask_cupy(flow_dir_da, direction, # ===================================================================== @supports_dataset -def flow_length(flow_dir: xr.DataArray, +def flow_length_d8(flow_dir: xr.DataArray, direction: str = 'downstream', name: str = 'flow_length') -> xr.DataArray: """Compute D8 flow length from a flow direction grid. diff --git a/xrspatial/flow_length_dinf.py b/xrspatial/hydro/flow_length_dinf.py similarity index 99% rename from xrspatial/flow_length_dinf.py rename to xrspatial/hydro/flow_length_dinf.py index 4c8e0180..b4578e6a 100644 --- a/xrspatial/flow_length_dinf.py +++ b/xrspatial/hydro/flow_length_dinf.py @@ -23,9 +23,9 @@ except ImportError: da = None -from xrspatial._boundary_store import BoundaryStore -from xrspatial.flow_accumulation import _preprocess_tiles -from xrspatial.flow_accumulation_dinf import _angle_to_neighbors +from xrspatial.hydro._boundary_store import BoundaryStore +from xrspatial.hydro.flow_accumulation_d8 import _preprocess_tiles +from xrspatial.hydro.flow_accumulation_dinf import _angle_to_neighbors from xrspatial.utils import ( _validate_raster, get_dataarray_resolution, diff --git a/xrspatial/flow_length_mfd.py b/xrspatial/hydro/flow_length_mfd.py similarity index 99% rename from xrspatial/flow_length_mfd.py rename to xrspatial/hydro/flow_length_mfd.py index f0d77797..d7336da1 100644 --- 
a/xrspatial/flow_length_mfd.py +++ b/xrspatial/hydro/flow_length_mfd.py @@ -23,7 +23,7 @@ except ImportError: da = None -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.utils import ( _validate_raster, get_dataarray_resolution, diff --git a/xrspatial/flow_path.py b/xrspatial/hydro/flow_path_d8.py similarity index 99% rename from xrspatial/flow_path.py rename to xrspatial/hydro/flow_path_d8.py index 9b41f642..e6077feb 100644 --- a/xrspatial/flow_path.py +++ b/xrspatial/hydro/flow_path_d8.py @@ -32,7 +32,7 @@ class cupy: # type: ignore[no-redef] except ImportError: da = None -from xrspatial.flow_accumulation import _code_to_offset, _code_to_offset_py +from xrspatial.hydro.flow_accumulation_d8 import _code_to_offset, _code_to_offset_py from xrspatial.utils import ( _validate_raster, has_cuda_and_cupy, @@ -348,7 +348,7 @@ def _flow_path_dask_cupy(flow_dir_data, start_points_data): # ===================================================================== @supports_dataset -def flow_path(flow_dir: xr.DataArray, +def flow_path_d8(flow_dir: xr.DataArray, start_points: xr.DataArray, name: str = 'flow_path') -> xr.DataArray: """Trace downstream flow paths from start points through a D8 grid. diff --git a/xrspatial/hydro/flow_path_dinf.py b/xrspatial/hydro/flow_path_dinf.py new file mode 100644 index 00000000..d0423cd3 --- /dev/null +++ b/xrspatial/hydro/flow_path_dinf.py @@ -0,0 +1,364 @@ +"""Trace downstream flow paths from start points through a D-inf direction grid. + +Uses the dominant-neighbor approach: at each cell, the D-inf angle +decomposes into two neighbors with proportional weights; the path +follows whichever neighbor receives the higher weight. + +Algorithm +--------- +For each non-NaN cell in ``start_points``: +1. Decompose the D-inf angle into two neighbors and weights. +2. Follow the dominant neighbor (higher weight) at each step. +3. 
Write the start cell's label to every visited cell. +4. Stop at NaN, pit (angle < 0), out-of-bounds, or grid edge. +""" + +from __future__ import annotations + +import math + +import numpy as np +import xarray as xr + +try: + import cupy +except ImportError: + class cupy: # type: ignore[no-redef] + ndarray = False + +try: + import dask.array as da +except ImportError: + da = None + +from xrspatial.hydro.flow_accumulation_dinf import _angle_to_neighbors +from xrspatial.utils import ( + _validate_raster, + has_cuda_and_cupy, + is_cupy_array, + is_dask_cupy, + ngjit, +) +from xrspatial.dataset_support import supports_dataset + + +# ===================================================================== +# Pure-Python angle decomposition (for dask tracing loop) +# ===================================================================== + +def _angle_to_neighbors_py(angle): + """Pure-Python version of _angle_to_neighbors for non-numba contexts.""" + if isinstance(angle, float) and (math.isnan(angle) or angle < 0.0): + return (0, 0, 0.0, 0, 0, 0.0) + if angle < 0.0 or angle != angle: + return (0, 0, 0.0, 0, 0, 0.0) + + pi_over_4 = math.pi / 4 + k = int(angle / pi_over_4) + if k > 7: + k = 7 + alpha = angle - k * pi_over_4 + w1 = 1.0 - alpha / pi_over_4 + w2 = alpha / pi_over_4 + + facets = [ + ((0, 1), (-1, 1)), # k=0: E, NE + ((-1, 1), (-1, 0)), # k=1: NE, N + ((-1, 0), (-1, -1)), # k=2: N, NW + ((-1, -1), (0, -1)), # k=3: NW, W + ((0, -1), (1, -1)), # k=4: W, SW + ((1, -1), (1, 0)), # k=5: SW, S + ((1, 0), (1, 1)), # k=6: S, SE + ((1, 1), (0, 1)), # k=7: SE, E + ] + (dy1, dx1), (dy2, dx2) = facets[k] + return (dy1, dx1, w1, dy2, dx2, w2) + + +# ===================================================================== +# CPU kernel +# ===================================================================== + +@ngjit +def _flow_path_dinf_cpu(flow_dir, start_points, H, W): + """Trace downstream paths using D-inf dominant neighbor.""" + out = np.empty((H, W), dtype=np.float64) + 
out[:] = np.nan + + for r in range(H): + for c in range(W): + v = start_points[r, c] + if v != v: # NaN + continue + label = v + cr, cc = r, c + while True: + out[cr, cc] = label + angle = flow_dir[cr, cc] + if angle != angle: # NaN + break + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(angle) + if w1 <= 0.0 and w2 <= 0.0: + break # pit + if w1 >= w2: + dy, dx = dy1, dx1 + else: + dy, dx = dy2, dx2 + nr = cr + dy + nc = cc + dx + if nr < 0 or nr >= H or nc < 0 or nc >= W: + break + cr, cc = nr, nc + + return out + + +# ===================================================================== +# CuPy backend +# ===================================================================== + +def _flow_path_dinf_cupy(flow_dir_data, start_points_data): + """CuPy: convert to numpy, run CPU kernel, convert back.""" + import cupy as cp + + fd_np = flow_dir_data.get() if hasattr(flow_dir_data, 'get') else np.asarray(flow_dir_data) + sp_np = start_points_data.get() if hasattr(start_points_data, 'get') else np.asarray(start_points_data) + fd_np = fd_np.astype(np.float64) + sp_np = sp_np.astype(np.float64) + H, W = fd_np.shape + out = _flow_path_dinf_cpu(fd_np, sp_np, H, W) + return cp.asarray(out) + + +# ===================================================================== +# Dask backend +# ===================================================================== + +def _flow_path_dinf_dask(flow_dir_data, start_points_data): + """Dask: sparse start-point extraction, LRU-cached tracing, lazy assembly.""" + from xrspatial.hydro.flow_path_d8 import _group_cells_by_chunk + from functools import lru_cache + + H, W = flow_dir_data.shape + chunks_y = start_points_data.chunks[0] + chunks_x = start_points_data.chunks[1] + + # Phase 1: identify chunks with start points + def _has_sp(block): + return np.array( + [[np.any(~np.isnan(np.asarray(block))).item()]], + dtype=np.int8, + ) + + flags = da.map_blocks( + _has_sp, start_points_data, + dtype=np.int8, + chunks=tuple((1,) * len(c) for c in 
start_points_data.chunks), + ).compute() + + # Phase 2: extract start point coordinates + points = [] + row_off = 0 + for iy, cy in enumerate(chunks_y): + col_off = 0 + for ix, cx in enumerate(chunks_x): + if flags[iy, ix]: + chunk = np.asarray( + start_points_data.blocks[iy, ix].compute(), + dtype=np.float64, + ) + rs, cs = np.where(~np.isnan(chunk)) + for k in range(len(rs)): + points.append(( + row_off + int(rs[k]), + col_off + int(cs[k]), + float(chunk[rs[k], cs[k]]), + )) + col_off += cx + row_off += cy + + # Phase 3: trace paths with LRU cache + fd_chunks_y = flow_dir_data.chunks[0] + fd_chunks_x = flow_dir_data.chunks[1] + + fd_row_offsets = np.zeros(len(fd_chunks_y) + 1, dtype=np.int64) + for i, cy in enumerate(fd_chunks_y): + fd_row_offsets[i + 1] = fd_row_offsets[i] + cy + fd_col_offsets = np.zeros(len(fd_chunks_x) + 1, dtype=np.int64) + for i, cx in enumerate(fd_chunks_x): + fd_col_offsets[i + 1] = fd_col_offsets[i] + cx + + max_chunk_bytes = max( + int(cy) * int(cx) * 8 + for cy in fd_chunks_y for cx in fd_chunks_x + ) + cache_size = max(4, (512 * 1024 * 1024) // max(max_chunk_bytes, 1)) + + @lru_cache(maxsize=cache_size) + def _get_chunk(iy, ix): + return np.asarray( + flow_dir_data.blocks[iy, ix].compute(), dtype=np.float64) + + def _find_chunk(r, c): + iy = int(np.searchsorted(fd_row_offsets[1:], r, side='right')) + ix = int(np.searchsorted(fd_col_offsets[1:], c, side='right')) + return iy, ix, r - int(fd_row_offsets[iy]), c - int(fd_col_offsets[ix]) + + _init_cap = max(1024, len(points) * 4) + _buf_rows = np.empty(_init_cap, dtype=np.int64) + _buf_cols = np.empty(_init_cap, dtype=np.int64) + _buf_labels = np.empty(_init_cap, dtype=np.float64) + _buf_len = 0 + + for r, c, label in points: + cr, cc = r, c + while True: + if _buf_len >= len(_buf_rows): + new_cap = len(_buf_rows) * 2 + _new_rows = np.empty(new_cap, dtype=np.int64) + _new_rows[:_buf_len] = _buf_rows[:_buf_len] + _buf_rows = _new_rows + _new_cols = np.empty(new_cap, dtype=np.int64) + 
def _flow_path_dinf_dask_cupy(flow_dir_data, start_points_data):
    """Dask+CuPy backend.

    Moves every chunk to host memory, delegates to the numpy-backed
    dask implementation, then transfers the result back to the GPU
    chunk by chunk. The graph stays lazy throughout.
    """
    import cupy as cp

    def _to_host(arr):
        # Per-chunk device -> host transfer.
        return arr.map_blocks(
            lambda b: b.get(), dtype=arr.dtype,
            meta=np.array((), dtype=arr.dtype),
        )

    traced = _flow_path_dinf_dask(_to_host(flow_dir_data),
                                  _to_host(start_points_data))

    # Host -> device for the assembled result.
    return traced.map_blocks(
        cp.asarray, dtype=traced.dtype,
        meta=cp.array((), dtype=traced.dtype),
    )


# =====================================================================
# Public API
# =====================================================================

@supports_dataset
def flow_path_dinf(flow_dir_dinf: xr.DataArray,
                   start_points: xr.DataArray,
                   name: str = 'flow_path_dinf') -> xr.DataArray:
    """Trace downstream flow paths using the D-infinity dominant neighbor.

    Parameters
    ----------
    flow_dir_dinf : xarray.DataArray or xr.Dataset
        2D D-infinity flow direction grid: continuous angles in radians
        [0, 2*pi), -1.0 for pits, NaN for nodata.
    start_points : xarray.DataArray
        2D raster; every non-NaN cell starts a path and its value is
        used as the label written along that path.
    name : str, default 'flow_path_dinf'
        Name of the output DataArray.

    Returns
    -------
    xarray.DataArray or xr.Dataset
        Same-shape grid where each traced-path cell carries the label
        of its originating start point; all other cells are NaN. Where
        paths overlap, the last start point in raster-scan order wins.
    """
    _validate_raster(flow_dir_dinf, func_name='flow_path_dinf',
                     name='flow_dir_dinf')

    fd_data = flow_dir_dinf.data
    sp_data = start_points.data

    # Backend dispatch: numpy -> cupy -> dask+cupy -> dask -> error.
    if isinstance(fd_data, np.ndarray):
        rows, cols = fd_data.shape
        out = _flow_path_dinf_cpu(fd_data.astype(np.float64),
                                  np.asarray(sp_data, dtype=np.float64),
                                  rows, cols)
    elif has_cuda_and_cupy() and is_cupy_array(fd_data):
        out = _flow_path_dinf_cupy(fd_data, sp_data)
    elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_dinf):
        out = _flow_path_dinf_dask_cupy(fd_data, sp_data)
    elif da is not None and isinstance(fd_data, da.Array):
        out = _flow_path_dinf_dask(fd_data, sp_data)
    else:
        raise TypeError(f"Unsupported array type: {type(fd_data)}")

    return xr.DataArray(out,
                        name=name,
                        coords=flow_dir_dinf.coords,
                        dims=flow_dir_dinf.dims,
                        attrs=flow_dir_dinf.attrs)
"""

from __future__ import annotations

import numpy as np
import xarray as xr

try:
    import cupy
except ImportError:
    # Stub so `cupy.ndarray` checks elsewhere stay falsy without cupy.
    class cupy:  # type: ignore[no-redef]
        ndarray = False

try:
    import dask.array as da
except ImportError:
    da = None

from xrspatial.utils import (
    _validate_raster,
    has_cuda_and_cupy,
    is_cupy_array,
    is_dask_cupy,
    ngjit,
)
from xrspatial.dataset_support import supports_dataset

# Neighbor offsets: E, SE, S, SW, W, NW, N, NE
_DY_NP = np.array([0, 1, 1, 1, 0, -1, -1, -1], dtype=np.int64)
_DX_NP = np.array([1, 1, 0, -1, -1, -1, 0, 1], dtype=np.int64)


def _dominant_neighbor_py(fractions, r, c):
    """Pure-Python: find dominant neighbor direction and offset.

    Returns (dy, dx, frac) or (0, 0, 0.0) if pit/nodata.

    NOTE(review): not referenced by the kernels in this module as seen
    here — presumably imported by sibling modules; confirm before
    removing.
    """
    dy_arr = [0, 1, 1, 1, 0, -1, -1, -1]
    dx_arr = [1, 1, 0, -1, -1, -1, 0, 1]
    best_k = -1
    best_frac = 0.0
    # Strict '>' keeps the first of equal-fraction neighbors (lowest k).
    for k in range(8):
        f = float(fractions[k, r, c])
        if f > best_frac:
            best_frac = f
            best_k = k
    if best_k == -1:
        return 0, 0, 0.0
    return dy_arr[best_k], dx_arr[best_k], best_frac


# =====================================================================
# CPU kernel
# =====================================================================

@ngjit
def _flow_path_mfd_cpu(fractions, start_points, H, W):
    """Trace downstream paths using MFD dominant neighbor.

    fractions : (8, H, W) float64, per-direction outflow fractions.
    start_points : (H, W) float64, non-NaN cells start a path.
    Returns (H, W) float64 label grid (NaN off-path). Later start
    points (raster-scan order) overwrite earlier labels on overlap.
    """
    dy = np.array([0, 1, 1, 1, 0, -1, -1, -1], dtype=np.int64)
    dx = np.array([1, 1, 0, -1, -1, -1, 0, 1], dtype=np.int64)

    out = np.empty((H, W), dtype=np.float64)
    out[:] = np.nan

    for r in range(H):
        for c in range(W):
            v = start_points[r, c]
            if v != v:  # NaN -> not a start point
                continue
            label = v
            cr, cc = r, c
            while True:
                # Label first, then test exit conditions: nodata cells
                # reached by a path are still labeled.
                out[cr, cc] = label
                # Band-0 NaN marks nodata for the whole cell (module
                # convention; same test as the dask path below).
                chk = fractions[0, cr, cc]
                if chk != chk:  # NaN -> nodata
                    break
                # Find dominant (highest-fraction) neighbor.
                best_k = -1
                best_frac = 0.0
                for k in range(8):
                    f = fractions[k, cr, cc]
                    if f > best_frac:
                        best_frac = f
                        best_k = k
                if best_k == -1:
                    break  # pit: all fractions zero
                nr = cr + dy[best_k]
                nc = cc + dx[best_k]
                if nr < 0 or nr >= H or nc < 0 or nc >= W:
                    break  # path leaves the grid
                cr, cc = nr, nc

    return out


# =====================================================================
# CuPy backend
# =====================================================================

def _flow_path_mfd_cupy(fractions_data, start_points_data):
    """CuPy: convert to numpy, run CPU kernel, convert back.

    Path tracing is inherently sequential, so no GPU kernel exists;
    this round-trips through host memory.
    """
    import cupy as cp

    fr_np = fractions_data.get() if hasattr(fractions_data, 'get') else np.asarray(fractions_data)
    sp_np = start_points_data.get() if hasattr(start_points_data, 'get') else np.asarray(start_points_data)
    fr_np = fr_np.astype(np.float64)
    sp_np = sp_np.astype(np.float64)
    _, H, W = fr_np.shape
    out = _flow_path_mfd_cpu(fr_np, sp_np, H, W)
    return cp.asarray(out)


# =====================================================================
# Dask backend
# =====================================================================

def _flow_path_mfd_dask(fractions_data, start_points_data, chunks_y, chunks_x):
    """Dask: sparse start-point extraction, LRU-cached tracing, lazy assembly.

    chunks_y/chunks_x describe the *spatial* chunking of the (8, H, W)
    fraction stack; start_points may be chunked differently.
    """
    from xrspatial.hydro.flow_path_d8 import _group_cells_by_chunk
    from functools import lru_cache

    _, H, W = fractions_data.shape

    # Phase 1: cheap boolean pass — which start_points chunks contain
    # any non-NaN cell. Only those chunks are materialized in phase 2.
    sp_chunks_y = start_points_data.chunks[0]
    sp_chunks_x = start_points_data.chunks[1]

    def _has_sp(block):
        # 1x1 int8 flag per chunk.
        return np.array(
            [[np.any(~np.isnan(np.asarray(block))).item()]],
            dtype=np.int8,
        )

    flags = da.map_blocks(
        _has_sp, start_points_data,
        dtype=np.int8,
        chunks=tuple((1,) * len(c) for c in start_points_data.chunks),
    ).compute()

    # Phase 2: extract (global_row, global_col, label) for every start
    # point, computing only the flagged chunks.
    points = []
    row_off = 0
    for iy, cy in enumerate(sp_chunks_y):
        col_off = 0
        for ix, cx in enumerate(sp_chunks_x):
            if flags[iy, ix]:
                chunk = np.asarray(
                    start_points_data.blocks[iy, ix].compute(),
                    dtype=np.float64,
                )
                rs, cs = np.where(~np.isnan(chunk))
                for k in range(len(rs)):
                    points.append((
                        row_off + int(rs[k]),
                        col_off + int(cs[k]),
                        float(chunk[rs[k], cs[k]]),
                    ))
            col_off += cx
        row_off += cy

    # Phase 3: sequential tracing on the driver, fetching fraction
    # tiles on demand through an LRU cache.
    # Prefix sums of chunk sizes -> global index of each tile edge.
    fd_row_offsets = np.zeros(len(chunks_y) + 1, dtype=np.int64)
    for i, cy in enumerate(chunks_y):
        fd_row_offsets[i + 1] = fd_row_offsets[i] + cy
    fd_col_offsets = np.zeros(len(chunks_x) + 1, dtype=np.int64)
    for i, cx in enumerate(chunks_x):
        fd_col_offsets[i + 1] = fd_col_offsets[i] + cx

    # Cache sized to ~512 MB of resident tiles.
    max_chunk_bytes = max(
        8 * int(cy) * int(cx) * 8  # 8 bands * tile * 8 bytes
        for cy in chunks_y for cx in chunks_x
    )
    cache_size = max(4, (512 * 1024 * 1024) // max(max_chunk_bytes, 1))

    @lru_cache(maxsize=cache_size)
    def _get_chunk(iy, ix):
        # Materialize one (8, cy, cx) fraction tile.
        y_start = int(fd_row_offsets[iy])
        y_end = int(fd_row_offsets[iy + 1])
        x_start = int(fd_col_offsets[ix])
        x_end = int(fd_col_offsets[ix + 1])
        return np.asarray(
            fractions_data[:, y_start:y_end, x_start:x_end].compute(),
            dtype=np.float64,
        )

    def _find_chunk(r, c):
        # Map a global (r, c) to (tile_iy, tile_ix, local_r, local_c).
        iy = int(np.searchsorted(fd_row_offsets[1:], r, side='right'))
        ix = int(np.searchsorted(fd_col_offsets[1:], c, side='right'))
        return iy, ix, r - int(fd_row_offsets[iy]), c - int(fd_col_offsets[ix])

    # Growable coordinate/label buffers (doubled on overflow).
    _init_cap = max(1024, len(points) * 4)
    _buf_rows = np.empty(_init_cap, dtype=np.int64)
    _buf_cols = np.empty(_init_cap, dtype=np.int64)
    _buf_labels = np.empty(_init_cap, dtype=np.float64)
    _buf_len = 0

    dy_arr = [0, 1, 1, 1, 0, -1, -1, -1]
    dx_arr = [1, 1, 0, -1, -1, -1, 0, 1]

    for r, c, label in points:
        cr, cc = r, c
        while True:
            if _buf_len >= len(_buf_rows):
                # Double all three buffers, preserving contents.
                new_cap = len(_buf_rows) * 2
                _new_rows = np.empty(new_cap, dtype=np.int64)
                _new_rows[:_buf_len] = _buf_rows[:_buf_len]
                _buf_rows = _new_rows
                _new_cols = np.empty(new_cap, dtype=np.int64)
                _new_cols[:_buf_len] = _buf_cols[:_buf_len]
                _buf_cols = _new_cols
                _new_labels = np.empty(new_cap, dtype=np.float64)
                _new_labels[:_buf_len] = _buf_labels[:_buf_len]
                _buf_labels = _new_labels

            _buf_rows[_buf_len] = cr
            _buf_cols[_buf_len] = cc
            _buf_labels[_buf_len] = label
            _buf_len += 1

            iy, ix, lr, lc = _find_chunk(cr, cc)
            chunk = _get_chunk(iy, ix)
            # Band-0 NaN marks nodata (same convention as CPU kernel).
            if np.isnan(chunk[0, lr, lc]):
                break
            # Dominant neighbor: highest fraction, ties -> lowest k.
            best_k = -1
            best_frac = 0.0
            for k in range(8):
                f = float(chunk[k, lr, lc])
                if f > best_frac:
                    best_frac = f
                    best_k = k
            if best_k == -1:
                break  # pit
            nr = cr + dy_arr[best_k]
            nc = cc + dx_arr[best_k]
            if nr < 0 or nr >= H or nc < 0 or nc >= W:
                break
            cr, cc = nr, nc

    path_rows = _buf_rows[:_buf_len]
    path_cols = _buf_cols[:_buf_len]
    path_labels = _buf_labels[:_buf_len]

    _get_chunk.cache_clear()

    # Phase 4: scatter the traced cells back into a lazy dask array,
    # one group of cells per output chunk.
    _grouped = _group_cells_by_chunk(
        path_rows, path_cols, path_labels,
        chunks_y, chunks_x,
    )

    def _assemble_block(block, block_info=None):
        if block_info is None or 0 not in block_info:
            # map_blocks meta/dry-run call: shape-only answer.
            return np.full(block.shape, np.nan, dtype=np.float64)
        loc = block_info[0]['chunk-location']
        out = np.full(block.shape, np.nan, dtype=np.float64)
        group = _grouped.get((loc[0], loc[1]))
        if group is not None:
            local_r, local_c, lbls = group
            out[local_r, local_c] = lbls
        return out

    dummy = da.zeros((H, W), chunks=(chunks_y, chunks_x), dtype=np.float64)
    return da.map_blocks(
        _assemble_block, dummy,
        dtype=np.float64,
        meta=np.array((), dtype=np.float64),
    )


# =====================================================================
# Dask+CuPy backend
# =====================================================================

def _flow_path_mfd_dask_cupy(fractions_data, start_points_data, chunks_y, chunks_x):
    """Dask+CuPy: convert to numpy dask, run dask path, convert back."""
    import cupy as cp

    fr_np = fractions_data.map_blocks(
        lambda b: b.get(), dtype=fractions_data.dtype,
        meta=np.array((), dtype=fractions_data.dtype),
    )
    sp_np = start_points_data.map_blocks(
        lambda b: b.get(),
@supports_dataset
def flow_path_mfd(flow_dir_mfd: xr.DataArray,
                  start_points: xr.DataArray,
                  name: str = 'flow_path_mfd') -> xr.DataArray:
    """Trace downstream flow paths through an MFD fraction grid.

    At every visited cell the neighbor holding the largest outflow
    fraction is followed, so each start point yields exactly one path.

    Parameters
    ----------
    flow_dir_mfd : xarray.DataArray or xr.Dataset
        3D MFD flow direction array of shape (8, H, W) as produced by
        flow_direction_mfd.
    start_points : xarray.DataArray
        2D raster; every non-NaN cell starts a path and its value is
        the label written along that path.
    name : str, default 'flow_path_mfd'
        Name of the output DataArray.

    Returns
    -------
    xarray.DataArray or xr.Dataset
        2D grid carrying start-point labels along traced paths; all
        other cells are NaN.
    """
    _validate_raster(flow_dir_mfd, func_name='flow_path_mfd',
                     name='flow_dir_mfd', ndim=3)

    data = flow_dir_mfd.data
    sp_data = start_points.data

    if data.ndim != 3 or data.shape[0] != 8:
        raise ValueError(
            f"flow_dir_mfd must have shape (8, H, W), got {data.shape}")

    _, H, W = data.shape

    # Backend dispatch: numpy -> cupy -> dask+cupy -> dask -> error.
    if isinstance(data, np.ndarray):
        out = _flow_path_mfd_cpu(data.astype(np.float64),
                                 np.asarray(sp_data, dtype=np.float64),
                                 H, W)
    elif has_cuda_and_cupy() and is_cupy_array(data):
        out = _flow_path_mfd_cupy(data, sp_data)
    elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_mfd):
        out = _flow_path_mfd_dask_cupy(data, sp_data,
                                       data.chunks[1], data.chunks[2])
    elif da is not None and isinstance(data, da.Array):
        out = _flow_path_mfd_dask(data, sp_data,
                                  data.chunks[1], data.chunks[2])
    else:
        raise TypeError(f"Unsupported array type: {type(data)}")

    # The result is 2D: keep only the spatial dims and their coords.
    spatial_dims = flow_dir_mfd.dims[1:]
    coords = {d: flow_dir_mfd.coords[d]
              for d in spatial_dims if d in flow_dir_mfd.coords}

    return xr.DataArray(out,
                        name=name,
                        coords=coords,
                        dims=spatial_dims,
                        attrs=flow_dir_mfd.attrs)
"""D-infinity Height Above Nearest Drainage (HAND).

Uses D-inf angle decomposition for downstream tracing. At each cell,
the dominant neighbor (higher weight) is followed to find the nearest
stream cell. HAND = elevation - drain_elevation.

Algorithm
---------
CPU : Kahn's BFS topological sort with reverse propagation of drain_elev.
      In-degrees use both D-inf neighbors; reverse pass follows dominant.
GPU : CuPy-via-CPU.
Dask: iterative tile sweep with BoundaryStore exit-label propagation.
"""

from __future__ import annotations

# NOTE(review): `math`, `_angle_to_neighbors_py`, `_dominant_offset_py`
# and `_to_numpy_f64` are not referenced in this module as visible here;
# confirm before removing (kept per only-seeing-part-of-the-file rule).
import math

import numpy as np
import xarray as xr

try:
    import dask.array as da
except ImportError:
    da = None

from xrspatial.hydro.flow_accumulation_dinf import _angle_to_neighbors
from xrspatial.hydro.flow_path_dinf import _angle_to_neighbors_py
from xrspatial.hydro._boundary_store import BoundaryStore
from xrspatial.hydro.watershed_dinf import (
    _dominant_offset_py,
    _preprocess_tiles,
    _to_numpy_f64,
)
from xrspatial.utils import (
    _validate_raster,
    has_cuda_and_cupy,
    is_cupy_array,
    is_dask_cupy,
    ngjit,
)


# =====================================================================
# CPU kernel
# =====================================================================

@ngjit
def _hand_dinf_cpu(flow_dir, flow_accum, elevation, H, W, threshold):
    """Compute HAND via Kahn's BFS with D-inf angle decomposition.

    Returns (H, W) float64: elevation minus the elevation of the
    drainage cell each cell eventually flows into; NaN where flow_dir
    is NaN. Stream cells (flow_accum >= threshold) get HAND = 0.
    """
    in_degree = np.zeros((H, W), dtype=np.int32)
    valid = np.zeros((H, W), dtype=np.int8)
    is_stream = np.zeros((H, W), dtype=np.int8)
    drain_elev = np.empty((H, W), dtype=np.float64)
    hand_out = np.empty((H, W), dtype=np.float64)

    # Seed: stream cells drain to themselves; everything else unknown.
    for r in range(H):
        for c in range(W):
            v = flow_dir[r, c]
            if v == v:  # not NaN
                valid[r, c] = 1
                fa = flow_accum[r, c]
                if fa == fa and fa >= threshold:
                    is_stream[r, c] = 1
                    drain_elev[r, c] = elevation[r, c]
                else:
                    drain_elev[r, c] = np.nan
            else:
                drain_elev[r, c] = np.nan
            hand_out[r, c] = np.nan

    # In-degrees: both D-inf neighbors contribute (the reverse pass
    # follows only the dominant one, but ordering must respect both).
    for r in range(H):
        for c in range(W):
            if valid[r, c] == 0:
                continue
            dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c])
            if w1 > 0.0:
                nr, nc = r + dy1, c + dx1
                if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1:
                    in_degree[nr, nc] += 1
            if w2 > 0.0:
                nr, nc = r + dy2, c + dx2
                if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1:
                    in_degree[nr, nc] += 1

    # Kahn's BFS: flat arrays as queue; topological (upstream-first)
    # order of all valid cells.
    order_r = np.empty(H * W, dtype=np.int64)
    order_c = np.empty(H * W, dtype=np.int64)
    head = np.int64(0)
    tail = np.int64(0)

    for r in range(H):
        for c in range(W):
            if valid[r, c] == 1 and in_degree[r, c] == 0:
                order_r[tail] = r
                order_c[tail] = c
                tail += 1

    while head < tail:
        r = order_r[head]
        c = order_c[head]
        head += 1
        dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c])
        if w1 > 0.0:
            nr, nc = r + dy1, c + dx1
            if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1:
                in_degree[nr, nc] -= 1
                if in_degree[nr, nc] == 0:
                    order_r[tail] = nr
                    order_c[tail] = nc
                    tail += 1
        if w2 > 0.0:
            nr, nc = r + dy2, c + dx2
            if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1:
                in_degree[nr, nc] -= 1
                if in_degree[nr, nc] == 0:
                    order_r[tail] = nr
                    order_c[tail] = nc
                    tail += 1

    # Reverse (downstream-first) pass: copy drain_elev from the
    # dominant D-inf neighbor; cells with nowhere valid to drain
    # become their own drainage (HAND = 0).
    for i in range(tail - 1, -1, -1):
        r = order_r[i]
        c = order_c[i]
        if is_stream[r, c] == 1:
            continue
        dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c])
        if w1 <= 0.0 and w2 <= 0.0:
            drain_elev[r, c] = elevation[r, c]  # pit
            continue
        if w1 >= w2:
            ddy, ddx = dy1, dx1
        else:
            ddy, ddx = dy2, dx2
        nr, nc = r + ddy, c + ddx
        if nr < 0 or nr >= H or nc < 0 or nc >= W:
            drain_elev[r, c] = elevation[r, c]  # drains off-grid
            continue
        if valid[nr, nc] == 0:
            drain_elev[r, c] = elevation[r, c]  # drains into nodata
            continue
        de = drain_elev[nr, nc]
        if de == de:
            drain_elev[r, c] = de
        else:
            drain_elev[r, c] = elevation[r, c]

    for r in range(H):
        for c in range(W):
            if valid[r, c] == 1:
                hand_out[r, c] = elevation[r, c] - drain_elev[r, c]
            else:
                hand_out[r, c] = np.nan

    return hand_out


# =====================================================================
# CuPy backend
# =====================================================================

def _hand_dinf_cupy(fd_data, fa_data, elev_data, threshold):
    # Sequential algorithm: round-trip through host memory.
    import cupy as cp
    fd_np = fd_data.get().astype(np.float64)
    fa_np = fa_data.get().astype(np.float64)
    el_np = elev_data.get().astype(np.float64)
    H, W = fd_np.shape
    out = _hand_dinf_cpu(fd_np, fa_np, el_np, H, W, threshold)
    return cp.asarray(out)


# =====================================================================
# Dask tile kernel
# =====================================================================

@ngjit
def _hand_dinf_drain_elev_tile(flow_dir, flow_accum, elevation, h, w,
                               threshold,
                               exit_top, exit_bottom, exit_left, exit_right,
                               exit_tl, exit_tr, exit_bl, exit_br):
    """Compute drain_elev for a D-inf tile (for boundary propagation).

    exit_* carry drain elevations from neighboring tiles for cells
    whose dominant neighbor leaves this tile (NaN = not yet known).
    """
    in_degree = np.zeros((h, w), dtype=np.int32)
    valid = np.zeros((h, w), dtype=np.int8)
    is_stream = np.zeros((h, w), dtype=np.int8)
    drain_elev = np.empty((h, w), dtype=np.float64)
    known = np.zeros((h, w), dtype=np.int8)

    # Seed stream cells; `known` marks cells whose drain_elev is final.
    for r in range(h):
        for c in range(w):
            v = flow_dir[r, c]
            if v == v:
                valid[r, c] = 1
                fa = flow_accum[r, c]
                if fa == fa and fa >= threshold:
                    is_stream[r, c] = 1
                    drain_elev[r, c] = elevation[r, c]
                    known[r, c] = 1
                else:
drain_elev[r, c] = np.nan + else: + drain_elev[r, c] = np.nan + + # Apply exit labels at boundaries where dominant neighbor exits tile + for c in range(w): + if valid[0, c] == 1 and known[0, c] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[0, c]) + if w1 <= 0.0 and w2 <= 0.0: + continue + if w1 >= w2: + ddy = dy1 + else: + ddy = dy2 + if 0 + ddy < 0: + el = exit_top[c] + if el == el: + drain_elev[0, c] = el + known[0, c] = 1 + else: + drain_elev[0, c] = elevation[0, c] + known[0, c] = 1 + + for c in range(w): + if valid[h - 1, c] == 1 and known[h - 1, c] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[h - 1, c]) + if w1 <= 0.0 and w2 <= 0.0: + continue + if w1 >= w2: + ddy = dy1 + else: + ddy = dy2 + if h - 1 + ddy >= h: + el = exit_bottom[c] + if el == el: + drain_elev[h - 1, c] = el + known[h - 1, c] = 1 + else: + drain_elev[h - 1, c] = elevation[h - 1, c] + known[h - 1, c] = 1 + + for r in range(h): + if valid[r, 0] == 1 and known[r, 0] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, 0]) + if w1 <= 0.0 and w2 <= 0.0: + continue + if w1 >= w2: + ddx = dx1 + else: + ddx = dx2 + if 0 + ddx < 0: + el = exit_left[r] + if el == el: + drain_elev[r, 0] = el + known[r, 0] = 1 + else: + drain_elev[r, 0] = elevation[r, 0] + known[r, 0] = 1 + + for r in range(h): + if valid[r, w - 1] == 1 and known[r, w - 1] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, w - 1]) + if w1 <= 0.0 and w2 <= 0.0: + continue + if w1 >= w2: + ddx = dx1 + else: + ddx = dx2 + if w - 1 + ddx >= w: + el = exit_right[r] + if el == el: + drain_elev[r, w - 1] = el + known[r, w - 1] = 1 + else: + drain_elev[r, w - 1] = elevation[r, w - 1] + known[r, w - 1] = 1 + + # Corner overrides + if valid[0, 0] == 1 and known[0, 0] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[0, 0]) + if not (w1 <= 0.0 and w2 <= 0.0): + if w1 >= w2: + ddy, ddx = dy1, dx1 + else: + ddy, ddx = dy2, dx2 + if 0 + ddy < 0 and 0 + ddx < 0: 
+ if exit_tl == exit_tl: + drain_elev[0, 0] = exit_tl + known[0, 0] = 1 + if valid[0, w - 1] == 1 and known[0, w - 1] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[0, w - 1]) + if not (w1 <= 0.0 and w2 <= 0.0): + if w1 >= w2: + ddy, ddx = dy1, dx1 + else: + ddy, ddx = dy2, dx2 + if 0 + ddy < 0 and w - 1 + ddx >= w: + if exit_tr == exit_tr: + drain_elev[0, w - 1] = exit_tr + known[0, w - 1] = 1 + if valid[h - 1, 0] == 1 and known[h - 1, 0] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[h - 1, 0]) + if not (w1 <= 0.0 and w2 <= 0.0): + if w1 >= w2: + ddy, ddx = dy1, dx1 + else: + ddy, ddx = dy2, dx2 + if h - 1 + ddy >= h and 0 + ddx < 0: + if exit_bl == exit_bl: + drain_elev[h - 1, 0] = exit_bl + known[h - 1, 0] = 1 + if valid[h - 1, w - 1] == 1 and known[h - 1, w - 1] == 0: + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[h - 1, w - 1]) + if not (w1 <= 0.0 and w2 <= 0.0): + if w1 >= w2: + ddy, ddx = dy1, dx1 + else: + ddy, ddx = dy2, dx2 + if h - 1 + ddy >= h and w - 1 + ddx >= w: + if exit_br == exit_br: + drain_elev[h - 1, w - 1] = exit_br + known[h - 1, w - 1] = 1 + + # In-degrees (non-known cells only) + for r in range(h): + for c in range(w): + if valid[r, c] == 0 or known[r, c] == 1: + continue + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < h and 0 <= nc < w: + if valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] += 1 + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < h and 0 <= nc < w: + if valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] += 1 + + # BFS + order_r = np.empty(h * w, dtype=np.int64) + order_c = np.empty(h * w, dtype=np.int64) + head = np.int64(0) + tail = np.int64(0) + for r in range(h): + for c in range(w): + if valid[r, c] == 1 and known[r, c] == 0 and in_degree[r, c] == 0: + order_r[tail] = r + order_c[tail] = c + tail += 1 + while head < tail: + r = order_r[head] + c = order_c[head] 
+ head += 1 + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + if w1 > 0.0: + nr, nc = r + dy1, c + dx1 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + order_r[tail] = nr + order_c[tail] = nc + tail += 1 + if w2 > 0.0: + nr, nc = r + dy2, c + dx2 + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + order_r[tail] = nr + order_c[tail] = nc + tail += 1 + + # Reverse pass + for i in range(tail - 1, -1, -1): + r = order_r[i] + c = order_c[i] + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(flow_dir[r, c]) + if w1 <= 0.0 and w2 <= 0.0: + drain_elev[r, c] = elevation[r, c] + continue + if w1 >= w2: + ddy, ddx = dy1, dx1 + else: + ddy, ddx = dy2, dx2 + nr, nc = r + ddy, c + ddx + if nr < 0 or nr >= h or nc < 0 or nc >= w: + drain_elev[r, c] = elevation[r, c] + continue + if valid[nr, nc] == 0: + drain_elev[r, c] = elevation[r, c] + continue + de = drain_elev[nr, nc] + if de == de: + drain_elev[r, c] = de + else: + drain_elev[r, c] = elevation[r, c] + + return drain_elev + + +@ngjit +def _hand_dinf_tile_kernel(flow_dir, flow_accum, elevation, h, w, threshold, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br): + """HAND tile kernel: returns HAND values (elevation - drain_elev).""" + drain_elev = _hand_dinf_drain_elev_tile( + flow_dir, flow_accum, elevation, h, w, threshold, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br) + + out = np.empty((h, w), dtype=np.float64) + for r in range(h): + for c in range(w): + v = flow_dir[r, c] + if v == v: + out[r, c] = elevation[r, c] - drain_elev[r, c] + else: + out[r, c] = np.nan + return out + + +# ===================================================================== +# Dask iterative tile sweep +# ===================================================================== + +def 
_compute_exit_labels(iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + """Same exit-label pattern as watershed_dinf but propagating drain_elev.""" + from xrspatial.hydro.watershed_dinf import _compute_exit_labels as _ws_compute + return _ws_compute(iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + +def _process_tile_hand(iy, ix, flow_dir_da, flow_accum_da, elev_da, + boundaries, flow_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x): + fd_chunk = np.asarray( + flow_dir_da.blocks[iy, ix].compute(), dtype=np.float64) + fa_chunk = np.asarray( + flow_accum_da.blocks[iy, ix].compute(), dtype=np.float64) + el_chunk = np.asarray( + elev_da.blocks[iy, ix].compute(), dtype=np.float64) + h, w = fd_chunk.shape + + exits = _compute_exit_labels( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + drain_elev = _hand_dinf_drain_elev_tile( + fd_chunk, fa_chunk, el_chunk, h, w, threshold, *exits) + + new_top = drain_elev[0, :].copy() + new_bottom = drain_elev[-1, :].copy() + new_left = drain_elev[:, 0].copy() + new_right = drain_elev[:, -1].copy() + + changed = False + for side, new in (('top', new_top), ('bottom', new_bottom), + ('left', new_left), ('right', new_right)): + old = boundaries.get(side, iy, ix).copy() + with np.errstate(invalid='ignore'): + mask = ~(np.isnan(old) & np.isnan(new)) + if mask.any(): + diff = old[mask] != new[mask] + if np.any(diff): + changed = True + break + + boundaries.set('top', iy, ix, new_top) + boundaries.set('bottom', iy, ix, new_bottom) + boundaries.set('left', iy, ix, new_left) + boundaries.set('right', iy, ix, new_right) + + return changed + + +def _hand_dinf_dask(flow_dir_da, flow_accum_da, elev_da, threshold): + chunks_y = flow_dir_da.chunks[0] + chunks_x = flow_dir_da.chunks[1] + n_tile_y = len(chunks_y) + n_tile_x = len(chunks_x) + + flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x) + flow_bdry = flow_bdry.snapshot() + + boundaries = 
BoundaryStore(chunks_y, chunks_x, fill_value=np.nan) + + max_iterations = max(n_tile_y, n_tile_x) * 2 + 10 + + for _iteration in range(max_iterations): + any_changed = False + for iy in range(n_tile_y): + for ix in range(n_tile_x): + c = _process_tile_hand( + iy, ix, flow_dir_da, flow_accum_da, elev_da, + boundaries, flow_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x) + if c: + any_changed = True + for iy in reversed(range(n_tile_y)): + for ix in reversed(range(n_tile_x)): + c = _process_tile_hand( + iy, ix, flow_dir_da, flow_accum_da, elev_da, + boundaries, flow_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x) + if c: + any_changed = True + if not any_changed: + break + + boundaries = boundaries.snapshot() + + def _tile_fn(fd_block, fa_block, el_block, block_info=None): + if block_info is None or 0 not in block_info: + return np.full(fd_block.shape, np.nan, dtype=np.float64) + iy, ix = block_info[0]['chunk-location'] + h, w = fd_block.shape + exits = _compute_exit_labels( + iy, ix, boundaries, flow_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + return _hand_dinf_tile_kernel( + np.asarray(fd_block, dtype=np.float64), + np.asarray(fa_block, dtype=np.float64), + np.asarray(el_block, dtype=np.float64), + h, w, threshold, *exits) + + return da.map_blocks( + _tile_fn, + flow_dir_da, flow_accum_da, elev_da, + dtype=np.float64, + meta=np.array((), dtype=np.float64), + ) + + +def _hand_dinf_dask_cupy(flow_dir_da, flow_accum_da, elev_da, threshold): + import cupy as cp + fd_np = flow_dir_da.map_blocks( + lambda b: b.get(), dtype=flow_dir_da.dtype, + meta=np.array((), dtype=flow_dir_da.dtype)) + fa_np = flow_accum_da.map_blocks( + lambda b: b.get(), dtype=flow_accum_da.dtype, + meta=np.array((), dtype=flow_accum_da.dtype)) + el_np = elev_da.map_blocks( + lambda b: b.get(), dtype=elev_da.dtype, + meta=np.array((), dtype=elev_da.dtype)) + result = _hand_dinf_dask(fd_np, fa_np, el_np, threshold) + return result.map_blocks( + cp.asarray, 
def hand_dinf(flow_dir_dinf: xr.DataArray,
              flow_accum: xr.DataArray,
              elevation: xr.DataArray,
              threshold: float = 100,
              name: str = 'hand_dinf') -> xr.DataArray:
    """Compute Height Above Nearest Drainage from D-infinity flow.

    NOTE(review): unlike the flow_path_* public APIs this function is
    not wrapped with ``@supports_dataset`` — confirm that is intended.

    Parameters
    ----------
    flow_dir_dinf : xarray.DataArray
        2D D-infinity flow direction grid.
    flow_accum : xarray.DataArray
        2D flow accumulation grid.
    elevation : xarray.DataArray
        2D elevation grid.
    threshold : float, default 100
        Minimum flow accumulation to define a stream cell.
    name : str, default 'hand_dinf'
        Name of output DataArray.

    Returns
    -------
    xarray.DataArray
        2D float64 HAND grid. Stream cells have HAND = 0.
    """
    # Validate all three rasters up front with their own labels.
    for raster, label in ((flow_dir_dinf, 'flow_dir_dinf'),
                          (flow_accum, 'flow_accum'),
                          (elevation, 'elevation')):
        _validate_raster(raster, func_name='hand_dinf', name=label)

    fd_data = flow_dir_dinf.data
    fa_data = flow_accum.data
    el_data = elevation.data
    thr = float(threshold)

    # Backend dispatch: numpy -> cupy -> dask+cupy -> dask -> error.
    if isinstance(fd_data, np.ndarray):
        rows, cols = fd_data.shape
        out = _hand_dinf_cpu(fd_data.astype(np.float64),
                             np.asarray(fa_data, dtype=np.float64),
                             np.asarray(el_data, dtype=np.float64),
                             rows, cols, thr)
    elif has_cuda_and_cupy() and is_cupy_array(fd_data):
        out = _hand_dinf_cupy(fd_data, fa_data, el_data, thr)
    elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_dinf):
        out = _hand_dinf_dask_cupy(fd_data, fa_data, el_data, thr)
    elif da is not None and isinstance(fd_data, da.Array):
        out = _hand_dinf_dask(fd_data, fa_data, el_data, thr)
    else:
        raise TypeError(f"Unsupported array type: {type(fd_data)}")

    return xr.DataArray(out,
                        name=name,
                        coords=flow_dir_dinf.coords,
                        dims=flow_dir_dinf.dims,
                        attrs=flow_dir_dinf.attrs)
hand_out[r, c] = np.nan + + # In-degrees: all MFD neighbors with frac > 0 contribute + for r in range(H): + for c in range(W): + if valid[r, c] == 0: + continue + for k in range(8): + if fractions[k, r, c] > 0.0: + nr = r + dy[k] + nc = c + dx[k] + if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1: + in_degree[nr, nc] += 1 + + # BFS topological order + order_r = np.empty(H * W, dtype=np.int64) + order_c = np.empty(H * W, dtype=np.int64) + head = np.int64(0) + tail = np.int64(0) + + for r in range(H): + for c in range(W): + if valid[r, c] == 1 and in_degree[r, c] == 0: + order_r[tail] = r + order_c[tail] = c + tail += 1 + + while head < tail: + r = order_r[head] + c = order_c[head] + head += 1 + for k in range(8): + if fractions[k, r, c] > 0.0: + nr = r + dy[k] + nc = c + dx[k] + if 0 <= nr < H and 0 <= nc < W and valid[nr, nc] == 1: + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + order_r[tail] = nr + order_c[tail] = nc + tail += 1 + + # Reverse pass: propagate drain_elev via dominant neighbor + for i in range(tail - 1, -1, -1): + r = order_r[i] + c = order_c[i] + if is_stream[r, c] == 1: + continue + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, r, c] + if f > best_frac: + best_frac = f + best_k = k + if best_k == -1: + drain_elev[r, c] = elevation[r, c] + continue + nr, nc = r + dy[best_k], c + dx[best_k] + if nr < 0 or nr >= H or nc < 0 or nc >= W: + drain_elev[r, c] = elevation[r, c] + continue + if valid[nr, nc] == 0: + drain_elev[r, c] = elevation[r, c] + continue + de = drain_elev[nr, nc] + if de == de: + drain_elev[r, c] = de + else: + drain_elev[r, c] = elevation[r, c] + + for r in range(H): + for c in range(W): + if valid[r, c] == 1: + hand_out[r, c] = elevation[r, c] - drain_elev[r, c] + else: + hand_out[r, c] = np.nan + + return hand_out + + +# ===================================================================== +# CuPy backend +# ===================================================================== + +def 
_hand_mfd_cupy(fr_data, fa_data, elev_data, threshold): + import cupy as cp + fr_np = fr_data.get().astype(np.float64) + fa_np = fa_data.get().astype(np.float64) + el_np = elev_data.get().astype(np.float64) + _, H, W = fr_np.shape + out = _hand_mfd_cpu(fr_np, fa_np, el_np, H, W, threshold) + return cp.asarray(out) + + +# ===================================================================== +# Dask tile kernel +# ===================================================================== + +@ngjit +def _hand_mfd_drain_elev_tile(fractions, flow_accum, elevation, h, w, + threshold, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br): + """Compute drain_elev for an MFD tile (for boundary propagation).""" + dy = np.array([0, 1, 1, 1, 0, -1, -1, -1], dtype=np.int64) + dx = np.array([1, 1, 0, -1, -1, -1, 0, 1], dtype=np.int64) + + in_degree = np.zeros((h, w), dtype=np.int32) + valid = np.zeros((h, w), dtype=np.int8) + is_stream = np.zeros((h, w), dtype=np.int8) + drain_elev = np.empty((h, w), dtype=np.float64) + known = np.zeros((h, w), dtype=np.int8) + + for r in range(h): + for c in range(w): + v = fractions[0, r, c] + if v == v: + valid[r, c] = 1 + fa = flow_accum[r, c] + if fa == fa and fa >= threshold: + is_stream[r, c] = 1 + drain_elev[r, c] = elevation[r, c] + known[r, c] = 1 + else: + drain_elev[r, c] = np.nan + else: + drain_elev[r, c] = np.nan + + # Apply exit labels at boundaries where dominant neighbor exits tile + # Top row + for c in range(w): + if valid[0, c] == 1 and known[0, c] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, 0, c] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and 0 + dy[best_k] < 0: + el = exit_top[c] + if el == el: + drain_elev[0, c] = el + known[0, c] = 1 + else: + drain_elev[0, c] = elevation[0, c] + known[0, c] = 1 + + # Bottom row + for c in range(w): + if valid[h - 1, c] == 1 and known[h - 1, c] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + 
f = fractions[k, h - 1, c] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and h - 1 + dy[best_k] >= h: + el = exit_bottom[c] + if el == el: + drain_elev[h - 1, c] = el + known[h - 1, c] = 1 + else: + drain_elev[h - 1, c] = elevation[h - 1, c] + known[h - 1, c] = 1 + + # Left column + for r in range(h): + if valid[r, 0] == 1 and known[r, 0] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, r, 0] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and 0 + dx[best_k] < 0: + el = exit_left[r] + if el == el: + drain_elev[r, 0] = el + known[r, 0] = 1 + else: + drain_elev[r, 0] = elevation[r, 0] + known[r, 0] = 1 + + # Right column + for r in range(h): + if valid[r, w - 1] == 1 and known[r, w - 1] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, r, w - 1] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and w - 1 + dx[best_k] >= w: + el = exit_right[r] + if el == el: + drain_elev[r, w - 1] = el + known[r, w - 1] = 1 + else: + drain_elev[r, w - 1] = elevation[r, w - 1] + known[r, w - 1] = 1 + + # Corners + if valid[0, 0] == 1 and known[0, 0] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, 0, 0] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and 0 + dy[best_k] < 0 and 0 + dx[best_k] < 0: + if exit_tl == exit_tl: + drain_elev[0, 0] = exit_tl + known[0, 0] = 1 + + if valid[0, w - 1] == 1 and known[0, w - 1] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, 0, w - 1] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and 0 + dy[best_k] < 0 and w - 1 + dx[best_k] >= w: + if exit_tr == exit_tr: + drain_elev[0, w - 1] = exit_tr + known[0, w - 1] = 1 + + if valid[h - 1, 0] == 1 and known[h - 1, 0] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, h - 1, 0] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and h - 1 + dy[best_k] >= h and 0 + 
dx[best_k] < 0: + if exit_bl == exit_bl: + drain_elev[h - 1, 0] = exit_bl + known[h - 1, 0] = 1 + + if valid[h - 1, w - 1] == 1 and known[h - 1, w - 1] == 0: + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, h - 1, w - 1] + if f > best_frac: + best_frac = f + best_k = k + if best_k >= 0 and h - 1 + dy[best_k] >= h and w - 1 + dx[best_k] >= w: + if exit_br == exit_br: + drain_elev[h - 1, w - 1] = exit_br + known[h - 1, w - 1] = 1 + + # In-degrees + for r in range(h): + for c in range(w): + if valid[r, c] == 0 or known[r, c] == 1: + continue + for k in range(8): + if fractions[k, r, c] > 0.0: + nr = r + dy[k] + nc = c + dx[k] + if 0 <= nr < h and 0 <= nc < w: + if valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] += 1 + + # BFS + order_r = np.empty(h * w, dtype=np.int64) + order_c = np.empty(h * w, dtype=np.int64) + head = np.int64(0) + tail = np.int64(0) + for r in range(h): + for c in range(w): + if valid[r, c] == 1 and known[r, c] == 0 and in_degree[r, c] == 0: + order_r[tail] = r + order_c[tail] = c + tail += 1 + while head < tail: + r = order_r[head] + c = order_c[head] + head += 1 + for k in range(8): + if fractions[k, r, c] > 0.0: + nr = r + dy[k] + nc = c + dx[k] + if 0 <= nr < h and 0 <= nc < w and valid[nr, nc] == 1 and known[nr, nc] == 0: + in_degree[nr, nc] -= 1 + if in_degree[nr, nc] == 0: + order_r[tail] = nr + order_c[tail] = nc + tail += 1 + + # Reverse pass + for i in range(tail - 1, -1, -1): + r = order_r[i] + c = order_c[i] + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, r, c] + if f > best_frac: + best_frac = f + best_k = k + if best_k == -1: + drain_elev[r, c] = elevation[r, c] + continue + nr, nc = r + dy[best_k], c + dx[best_k] + if nr < 0 or nr >= h or nc < 0 or nc >= w: + drain_elev[r, c] = elevation[r, c] + continue + if valid[nr, nc] == 0: + drain_elev[r, c] = elevation[r, c] + continue + de = drain_elev[nr, nc] + if de == de: + drain_elev[r, c] = de + else: + drain_elev[r, c] = 
elevation[r, c] + + return drain_elev + + +@ngjit +def _hand_mfd_tile_kernel(fractions, flow_accum, elevation, h, w, threshold, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br): + """HAND tile kernel: returns HAND values.""" + drain_elev = _hand_mfd_drain_elev_tile( + fractions, flow_accum, elevation, h, w, threshold, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br) + + out = np.empty((h, w), dtype=np.float64) + for r in range(h): + for c in range(w): + v = fractions[0, r, c] + if v == v: + out[r, c] = elevation[r, c] - drain_elev[r, c] + else: + out[r, c] = np.nan + return out + + +# ===================================================================== +# Dask iterative tile sweep +# ===================================================================== + +def _compute_exit_labels_mfd(iy, ix, boundaries, frac_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x): + from xrspatial.hydro.watershed_mfd import _compute_exit_labels_mfd as _ws_compute + return _ws_compute(iy, ix, boundaries, frac_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + +def _process_tile_hand_mfd(iy, ix, fractions_da, flow_accum_da, elev_da, + boundaries, frac_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x): + y_start = sum(chunks_y[:iy]) + y_end = y_start + chunks_y[iy] + x_start = sum(chunks_x[:ix]) + x_end = x_start + chunks_x[ix] + + fr_chunk = np.asarray( + fractions_da[:, y_start:y_end, x_start:x_end].compute(), + dtype=np.float64) + fa_chunk = np.asarray( + flow_accum_da.blocks[iy, ix].compute(), dtype=np.float64) + el_chunk = np.asarray( + elev_da.blocks[iy, ix].compute(), dtype=np.float64) + _, h, w = fr_chunk.shape + + exits = _compute_exit_labels_mfd( + iy, ix, boundaries, frac_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + drain_elev = _hand_mfd_drain_elev_tile( + fr_chunk, fa_chunk, el_chunk, h, w, threshold, *exits) + + new_top = drain_elev[0, :].copy() + new_bottom = drain_elev[-1, :].copy() + 
new_left = drain_elev[:, 0].copy() + new_right = drain_elev[:, -1].copy() + + changed = False + for side, new in (('top', new_top), ('bottom', new_bottom), + ('left', new_left), ('right', new_right)): + old = boundaries.get(side, iy, ix).copy() + with np.errstate(invalid='ignore'): + mask = ~(np.isnan(old) & np.isnan(new)) + if mask.any(): + diff = old[mask] != new[mask] + if np.any(diff): + changed = True + break + + boundaries.set('top', iy, ix, new_top) + boundaries.set('bottom', iy, ix, new_bottom) + boundaries.set('left', iy, ix, new_left) + boundaries.set('right', iy, ix, new_right) + + return changed + + +def _hand_mfd_dask(fractions_da, flow_accum_da, elev_da, threshold, + chunks_y, chunks_x): + n_tile_y = len(chunks_y) + n_tile_x = len(chunks_x) + + frac_bdry = _preprocess_mfd_tiles(fractions_da, chunks_y, chunks_x) + boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=np.nan) + + max_iterations = max(n_tile_y, n_tile_x) * 2 + 10 + + for _iteration in range(max_iterations): + any_changed = False + for iy in range(n_tile_y): + for ix in range(n_tile_x): + c = _process_tile_hand_mfd( + iy, ix, fractions_da, flow_accum_da, elev_da, + boundaries, frac_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x) + if c: + any_changed = True + for iy in reversed(range(n_tile_y)): + for ix in reversed(range(n_tile_x)): + c = _process_tile_hand_mfd( + iy, ix, fractions_da, flow_accum_da, elev_da, + boundaries, frac_bdry, threshold, + chunks_y, chunks_x, n_tile_y, n_tile_x) + if c: + any_changed = True + if not any_changed: + break + + boundaries = boundaries.snapshot() + + # Assemble + rows = [] + for iy in range(n_tile_y): + row = [] + for ix in range(n_tile_x): + y_start = sum(chunks_y[:iy]) + y_end = y_start + chunks_y[iy] + x_start = sum(chunks_x[:ix]) + x_end = x_start + chunks_x[ix] + + fr_chunk = np.asarray( + fractions_da[:, y_start:y_end, x_start:x_end].compute(), + dtype=np.float64) + fa_chunk = np.asarray( + flow_accum_da.blocks[iy, ix].compute(), 
dtype=np.float64) + el_chunk = np.asarray( + elev_da.blocks[iy, ix].compute(), dtype=np.float64) + _, h, w = fr_chunk.shape + + exits = _compute_exit_labels_mfd( + iy, ix, boundaries, frac_bdry, + chunks_y, chunks_x, n_tile_y, n_tile_x) + + tile = _hand_mfd_tile_kernel( + fr_chunk, fa_chunk, el_chunk, h, w, threshold, *exits) + row.append(da.from_array(tile, chunks=tile.shape)) + rows.append(row) + + return da.block(rows) + + +def _hand_mfd_dask_cupy(fractions_da, flow_accum_da, elev_da, threshold, + chunks_y, chunks_x): + import cupy as cp + fr_np = fractions_da.map_blocks( + lambda b: b.get(), dtype=fractions_da.dtype, + meta=np.array((), dtype=fractions_da.dtype)) + fa_np = flow_accum_da.map_blocks( + lambda b: b.get(), dtype=flow_accum_da.dtype, + meta=np.array((), dtype=flow_accum_da.dtype)) + el_np = elev_da.map_blocks( + lambda b: b.get(), dtype=elev_da.dtype, + meta=np.array((), dtype=elev_da.dtype)) + result = _hand_mfd_dask(fr_np, fa_np, el_np, threshold, + chunks_y, chunks_x) + return result.map_blocks( + cp.asarray, dtype=result.dtype, + meta=cp.array((), dtype=result.dtype)) + + +# ===================================================================== +# Public API +# ===================================================================== + +def hand_mfd(flow_dir_mfd: xr.DataArray, + flow_accum: xr.DataArray, + elevation: xr.DataArray, + threshold: float = 100, + name: str = 'hand_mfd') -> xr.DataArray: + """Compute HAND using MFD flow direction. + + Parameters + ---------- + flow_dir_mfd : xarray.DataArray + 3D MFD flow direction array of shape (8, H, W). + flow_accum : xarray.DataArray + 2D flow accumulation grid. + elevation : xarray.DataArray + 2D elevation grid. + threshold : float, default 100 + Minimum flow accumulation to define a stream cell. + name : str, default 'hand_mfd' + Name of output DataArray. + + Returns + ------- + xarray.DataArray + 2D float64 HAND grid. Stream cells have HAND = 0. 
+ """ + _validate_raster(flow_dir_mfd, func_name='hand_mfd', + name='flow_dir_mfd', ndim=3) + _validate_raster(flow_accum, func_name='hand_mfd', name='flow_accum') + _validate_raster(elevation, func_name='hand_mfd', name='elevation') + + data = flow_dir_mfd.data + fa_data = flow_accum.data + el_data = elevation.data + + if data.ndim != 3 or data.shape[0] != 8: + raise ValueError( + f"flow_dir_mfd must have shape (8, H, W), got {data.shape}") + + _, H, W = data.shape + + if isinstance(data, np.ndarray): + fr = data.astype(np.float64) + fa = np.asarray(fa_data, dtype=np.float64) + el = np.asarray(el_data, dtype=np.float64) + out = _hand_mfd_cpu(fr, fa, el, H, W, float(threshold)) + + elif has_cuda_and_cupy() and is_cupy_array(data): + out = _hand_mfd_cupy(data, fa_data, el_data, float(threshold)) + + elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_mfd): + chunks_y = data.chunks[1] + chunks_x = data.chunks[2] + out = _hand_mfd_dask_cupy(data, fa_data, el_data, + float(threshold), chunks_y, chunks_x) + + elif da is not None and isinstance(data, da.Array): + chunks_y = data.chunks[1] + chunks_x = data.chunks[2] + out = _hand_mfd_dask(data, fa_data, el_data, + float(threshold), chunks_y, chunks_x) + + else: + raise TypeError(f"Unsupported array type: {type(data)}") + + spatial_dims = flow_dir_mfd.dims[1:] + coords = {} + for d in spatial_dims: + if d in flow_dir_mfd.coords: + coords[d] = flow_dir_mfd.coords[d] + + return xr.DataArray(out, + name=name, + coords=coords, + dims=spatial_dims, + attrs=flow_dir_mfd.attrs) diff --git a/xrspatial/sink.py b/xrspatial/hydro/sink_d8.py similarity index 99% rename from xrspatial/sink.py rename to xrspatial/hydro/sink_d8.py index f4d811c3..1a139d31 100644 --- a/xrspatial/sink.py +++ b/xrspatial/hydro/sink_d8.py @@ -228,7 +228,7 @@ def _run_dask_cupy(data): # ===================================================================== @supports_dataset -def sink(flow_dir: xr.DataArray, +def sink_d8(flow_dir: xr.DataArray, name: str = 
'sink') -> xr.DataArray: """Identify and label depression cells in a D8 flow direction grid. diff --git a/xrspatial/snap_pour_point.py b/xrspatial/hydro/snap_pour_point_d8.py similarity index 99% rename from xrspatial/snap_pour_point.py rename to xrspatial/hydro/snap_pour_point_d8.py index 96bd58a1..0f8dc173 100644 --- a/xrspatial/snap_pour_point.py +++ b/xrspatial/hydro/snap_pour_point_d8.py @@ -454,7 +454,7 @@ def _assemble_block(block, block_info=None): # ===================================================================== @supports_dataset -def snap_pour_point(flow_accum: xr.DataArray, +def snap_pour_point_d8(flow_accum: xr.DataArray, pour_points: xr.DataArray, search_radius: int = 5, name: str = 'snapped_pour_points') -> xr.DataArray: diff --git a/xrspatial/stream_link.py b/xrspatial/hydro/stream_link_d8.py similarity index 99% rename from xrspatial/stream_link.py rename to xrspatial/hydro/stream_link_d8.py index 73462004..1c1ea76f 100644 --- a/xrspatial/stream_link.py +++ b/xrspatial/hydro/stream_link_d8.py @@ -42,10 +42,10 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset -from xrspatial.flow_accumulation import _code_to_offset, _code_to_offset_py -from xrspatial.stream_order import _preprocess_stream_tiles, _to_numpy_f64 +from xrspatial.hydro.flow_accumulation_d8 import _code_to_offset, _code_to_offset_py +from xrspatial.hydro.stream_order_d8 import _preprocess_stream_tiles, _to_numpy_f64 # ===================================================================== @@ -982,7 +982,7 @@ def _tile_fn(block, accum_block, block_info=None): # ===================================================================== @supports_dataset -def stream_link(flow_dir: xr.DataArray, +def stream_link_d8(flow_dir: xr.DataArray, flow_accum: xr.DataArray, threshold: float = 100, name: str = 'stream_link') -> 
xr.DataArray: diff --git a/xrspatial/stream_link_dinf.py b/xrspatial/hydro/stream_link_dinf.py similarity index 99% rename from xrspatial/stream_link_dinf.py rename to xrspatial/hydro/stream_link_dinf.py index b3777832..0aa497da 100644 --- a/xrspatial/stream_link_dinf.py +++ b/xrspatial/hydro/stream_link_dinf.py @@ -48,10 +48,10 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset -from xrspatial.stream_order import _to_numpy_f64 -from xrspatial.stream_order_dinf import ( +from xrspatial.hydro.stream_order_d8 import _to_numpy_f64 +from xrspatial.hydro.stream_order_dinf import ( _dinf_downstream, _NB_DY, _NB_DX, diff --git a/xrspatial/stream_link_mfd.py b/xrspatial/hydro/stream_link_mfd.py similarity index 99% rename from xrspatial/stream_link_mfd.py rename to xrspatial/hydro/stream_link_mfd.py index 50a3c972..7e649993 100644 --- a/xrspatial/stream_link_mfd.py +++ b/xrspatial/hydro/stream_link_mfd.py @@ -47,9 +47,9 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset -from xrspatial.stream_order import _to_numpy_f64 +from xrspatial.hydro.stream_order_d8 import _to_numpy_f64 # ===================================================================== diff --git a/xrspatial/stream_order.py b/xrspatial/hydro/stream_order_d8.py similarity index 99% rename from xrspatial/stream_order.py rename to xrspatial/hydro/stream_order_d8.py index c7122dad..158faaaf 100644 --- a/xrspatial/stream_order.py +++ b/xrspatial/hydro/stream_order_d8.py @@ -44,9 +44,9 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from 
xrspatial.dataset_support import supports_dataset -from xrspatial.flow_accumulation import _code_to_offset, _code_to_offset_py +from xrspatial.hydro.flow_accumulation_d8 import _code_to_offset, _code_to_offset_py def _to_numpy_f64(arr): @@ -1460,11 +1460,11 @@ def _tile_fn(block, accum_block, block_info=None): # ===================================================================== @supports_dataset -def stream_order(flow_dir: xr.DataArray, - flow_accum: xr.DataArray, - threshold: float = 100, - method: str = 'strahler', - name: str = 'stream_order') -> xr.DataArray: +def stream_order_d8(flow_dir: xr.DataArray, + flow_accum: xr.DataArray, + threshold: float = 100, + ordering: str = 'strahler', + name: str = 'stream_order') -> xr.DataArray: """Compute stream order from D8 flow direction and accumulation grids. Parameters @@ -1478,7 +1478,7 @@ def stream_order(flow_dir: xr.DataArray, threshold : float, default 100 Minimum accumulation to classify a cell as part of the stream network. - method : str, default 'strahler' + ordering : str, default 'strahler' ``'strahler'`` for Strahler branching hierarchy or ``'shreve'`` for Shreve cumulative magnitude. 
name : str, default 'stream_order' @@ -1501,10 +1501,10 @@ def stream_order(flow_dir: xr.DataArray, """ _validate_raster(flow_dir, func_name='stream_order', name='flow_dir') - method = method.lower() + method = ordering.lower() if method not in ('strahler', 'shreve'): raise ValueError( - f"method must be 'strahler' or 'shreve', got {method!r}") + f"ordering must be 'strahler' or 'shreve', got {ordering!r}") fd_data = flow_dir.data fa_data = flow_accum.data diff --git a/xrspatial/stream_order_dinf.py b/xrspatial/hydro/stream_order_dinf.py similarity index 99% rename from xrspatial/stream_order_dinf.py rename to xrspatial/hydro/stream_order_dinf.py index cd2fe365..66faf4ca 100644 --- a/xrspatial/stream_order_dinf.py +++ b/xrspatial/hydro/stream_order_dinf.py @@ -52,9 +52,9 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset -from xrspatial.stream_order import _to_numpy_f64 +from xrspatial.hydro.stream_order_d8 import _to_numpy_f64 # Neighbor offsets: counterclockwise from East diff --git a/xrspatial/stream_order_mfd.py b/xrspatial/hydro/stream_order_mfd.py similarity index 99% rename from xrspatial/stream_order_mfd.py rename to xrspatial/hydro/stream_order_mfd.py index cd216b24..be0783c7 100644 --- a/xrspatial/stream_order_mfd.py +++ b/xrspatial/hydro/stream_order_mfd.py @@ -44,7 +44,7 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset # Neighbor offsets: E, SE, S, SW, W, NW, N, NE diff --git a/xrspatial/hydro/tests/__init__.py b/xrspatial/hydro/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/xrspatial/hydro/tests/conftest.py b/xrspatial/hydro/tests/conftest.py new file mode 100644 index 
00000000..6d27702c --- /dev/null +++ b/xrspatial/hydro/tests/conftest.py @@ -0,0 +1,3 @@ +"""Re-export shared test fixtures so hydro tests can discover them.""" + +from xrspatial.tests.conftest import random_data # noqa: F401 diff --git a/xrspatial/tests/test_basin.py b/xrspatial/hydro/tests/test_basin_d8.py similarity index 97% rename from xrspatial/tests/test_basin.py rename to xrspatial/hydro/tests/test_basin_d8.py index 10c8eaa9..1a3612f8 100644 --- a/xrspatial/tests/test_basin.py +++ b/xrspatial/hydro/tests/test_basin_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import basin +from xrspatial.hydro import basin from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -155,7 +155,7 @@ def test_basin_dask_edge_exits(): @dask_array_available def test_basin_dask_random(): """Random acyclic flow_dir: dask basin matches numpy.""" - from xrspatial import flow_direction + from xrspatial.hydro import flow_direction rng = np.random.default_rng(123) elev = rng.random((8, 10)).astype(np.float64) @@ -180,7 +180,7 @@ def test_basin_dask_temp_cleanup(): import os import glob - from xrspatial import flow_direction + from xrspatial.hydro import flow_direction rng = np.random.default_rng(789) elev = rng.random((8, 10)).astype(np.float64) @@ -244,7 +244,7 @@ def test_basin_dask_cupy_random(): This test verifies the GPU tile kernel produces identical results to the CPU tile kernel. 
""" - from xrspatial import flow_direction + from xrspatial.hydro import flow_direction rng = np.random.default_rng(952) elev = rng.random((8, 10)).astype(np.float64) diff --git a/xrspatial/tests/test_fill.py b/xrspatial/hydro/tests/test_fill_d8.py similarity index 99% rename from xrspatial/tests/test_fill.py rename to xrspatial/hydro/tests/test_fill_d8.py index dc57f1f2..0783811f 100644 --- a/xrspatial/tests/test_fill.py +++ b/xrspatial/hydro/tests/test_fill_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import fill +from xrspatial.hydro import fill from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_flow_accumulation.py b/xrspatial/hydro/tests/test_flow_accumulation_d8.py similarity index 98% rename from xrspatial/tests/test_flow_accumulation.py rename to xrspatial/hydro/tests/test_flow_accumulation_d8.py index 925e2db0..b86012f0 100644 --- a/xrspatial/tests/test_flow_accumulation.py +++ b/xrspatial/hydro/tests/test_flow_accumulation_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import flow_accumulation +from xrspatial.hydro import flow_accumulation from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -286,7 +286,7 @@ def test_dask_random(): graphs (randomly assigned D8 codes can create cycles, which the iterative tile sweep cannot handle). 
""" - from xrspatial import flow_direction + from xrspatial.hydro import flow_direction rng = np.random.default_rng(42) elev = rng.random((10, 12)).astype(np.float64) @@ -355,7 +355,7 @@ def test_numpy_equals_dask_cupy(): @pytest.mark.parametrize("chunks", [(3, 3), (5, 6), (2, 4)]) def test_dask_cupy_chunk_configs(chunks): """Dask+CuPy matches NumPy across chunk sizes.""" - from xrspatial import flow_direction + from xrspatial.hydro import flow_direction rng = np.random.default_rng(952) elev = rng.random((10, 12)).astype(np.float64) diff --git a/xrspatial/tests/test_flow_accumulation_dinf.py b/xrspatial/hydro/tests/test_flow_accumulation_dinf.py similarity index 94% rename from xrspatial/tests/test_flow_accumulation_dinf.py rename to xrspatial/hydro/tests/test_flow_accumulation_dinf.py index 8665d05a..335ad8b4 100644 --- a/xrspatial/tests/test_flow_accumulation_dinf.py +++ b/xrspatial/hydro/tests/test_flow_accumulation_dinf.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from xrspatial.flow_accumulation_dinf import flow_accumulation_dinf -from xrspatial.flow_accumulation import _detect_flow_type +from xrspatial.hydro.flow_accumulation_dinf import flow_accumulation_dinf +from xrspatial.hydro.flow_accumulation_d8 import _detect_flow_type from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -107,7 +107,7 @@ def test_dinf_chain_with_split(): def test_dinf_end_to_end(): """flow_direction_dinf -> flow_accumulation_dinf produces valid results.""" - from xrspatial import flow_direction_dinf + from xrspatial.hydro import flow_direction_dinf rng = np.random.default_rng(42) elev = create_test_raster(rng.random((20, 20)) * 100) @@ -145,7 +145,7 @@ def test_dinf_nan_handling(): ]) def test_dinf_dask_matches_numpy(chunks): """Dinf dask matches numpy across chunk sizes.""" - from xrspatial import flow_direction_dinf + from xrspatial.hydro import flow_direction_dinf rng = np.random.default_rng(123) elev = 
create_test_raster(rng.random((6, 6)) * 100) @@ -163,7 +163,7 @@ def test_dinf_dask_matches_numpy(chunks): @dask_array_available def test_dinf_dask_larger_grid(): """Dinf dask on a larger grid with cross-tile flow.""" - from xrspatial import flow_direction_dinf + from xrspatial.hydro import flow_direction_dinf rng = np.random.default_rng(99) elev = create_test_raster(rng.random((12, 12)) * 100) @@ -183,7 +183,7 @@ def test_dinf_dask_larger_grid(): @cuda_and_cupy_available def test_dinf_cupy_matches_numpy(): """Dinf CuPy matches NumPy.""" - from xrspatial import flow_direction_dinf + from xrspatial.hydro import flow_direction_dinf rng = np.random.default_rng(42) elev = create_test_raster(rng.random((10, 10)) * 100) @@ -201,7 +201,7 @@ def test_dinf_cupy_matches_numpy(): @cuda_and_cupy_available def test_dinf_dask_cupy_matches_numpy(): """Dinf Dask+CuPy matches NumPy.""" - from xrspatial import flow_direction_dinf + from xrspatial.hydro import flow_direction_dinf rng = np.random.default_rng(42) elev = create_test_raster(rng.random((8, 8)) * 100) diff --git a/xrspatial/tests/test_flow_accumulation_mfd.py b/xrspatial/hydro/tests/test_flow_accumulation_mfd.py similarity index 98% rename from xrspatial/tests/test_flow_accumulation_mfd.py rename to xrspatial/hydro/tests/test_flow_accumulation_mfd.py index cfe5e82d..78a162ed 100644 --- a/xrspatial/tests/test_flow_accumulation_mfd.py +++ b/xrspatial/hydro/tests/test_flow_accumulation_mfd.py @@ -4,8 +4,8 @@ import pytest import xarray as xr -from xrspatial.flow_direction_mfd import flow_direction_mfd -from xrspatial.flow_accumulation_mfd import flow_accumulation_mfd +from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd +from xrspatial.hydro.flow_accumulation_mfd import flow_accumulation_mfd # ===================================================================== diff --git a/xrspatial/tests/test_flow_direction.py b/xrspatial/hydro/tests/test_flow_direction_d8.py similarity index 98% rename from 
xrspatial/tests/test_flow_direction.py rename to xrspatial/hydro/tests/test_flow_direction_d8.py index 4d956485..da9686bc 100644 --- a/xrspatial/tests/test_flow_direction.py +++ b/xrspatial/hydro/tests/test_flow_direction_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import flow_direction +from xrspatial.hydro import flow_direction from xrspatial.tests.general_checks import (assert_boundary_mode_correctness, assert_numpy_equals_cupy, assert_numpy_equals_dask_cupy, @@ -230,7 +230,7 @@ def test_nan_edges(): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_dask(random_data): +def test_numpy_equals_dask(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') dask_agg = create_test_raster(random_data, backend='dask') assert_numpy_equals_dask_numpy(numpy_agg, dask_agg, flow_direction) @@ -240,7 +240,7 @@ def test_numpy_equals_dask(random_data): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_cupy(random_data): +def test_numpy_equals_cupy(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') cupy_agg = create_test_raster(random_data, backend='cupy') assert_numpy_equals_cupy(numpy_agg, cupy_agg, flow_direction) @@ -251,7 +251,7 @@ def test_numpy_equals_cupy(random_data): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_dask_cupy(random_data): +def test_numpy_equals_dask_cupy(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') dask_cupy_agg = create_test_raster(random_data, backend='dask+cupy') assert_numpy_equals_dask_cupy(numpy_agg, dask_cupy_agg, flow_direction, diff --git a/xrspatial/tests/test_flow_direction_dinf.py 
b/xrspatial/hydro/tests/test_flow_direction_dinf.py similarity index 98% rename from xrspatial/tests/test_flow_direction_dinf.py rename to xrspatial/hydro/tests/test_flow_direction_dinf.py index e53bf2e8..e5ab84f6 100644 --- a/xrspatial/tests/test_flow_direction_dinf.py +++ b/xrspatial/hydro/tests/test_flow_direction_dinf.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial import flow_direction_dinf +from xrspatial.hydro import flow_direction_dinf from xrspatial.tests.general_checks import (assert_boundary_mode_correctness, assert_numpy_equals_cupy, assert_numpy_equals_dask_cupy, @@ -361,7 +361,7 @@ def test_dataset_support(): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_dask(random_data): +def test_numpy_equals_dask(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') dask_agg = create_test_raster(random_data, backend='dask') assert_numpy_equals_dask_numpy(numpy_agg, dask_agg, flow_direction_dinf) @@ -371,7 +371,7 @@ def test_numpy_equals_dask(random_data): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_cupy(random_data): +def test_numpy_equals_cupy(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') cupy_agg = create_test_raster(random_data, backend='cupy') assert_numpy_equals_cupy(numpy_agg, cupy_agg, flow_direction_dinf) @@ -382,7 +382,7 @@ def test_numpy_equals_cupy(random_data): @pytest.mark.parametrize("size", [(4, 6), (10, 15)]) @pytest.mark.parametrize( "dtype", [np.int32, np.int64, np.float32, np.float64]) -def test_numpy_equals_dask_cupy(random_data): +def test_numpy_equals_dask_cupy(random_data, size, dtype): numpy_agg = create_test_raster(random_data, backend='numpy') dask_cupy_agg = create_test_raster(random_data, backend='dask+cupy') 
assert_numpy_equals_dask_cupy(numpy_agg, dask_cupy_agg, diff --git a/xrspatial/tests/test_flow_direction_mfd.py b/xrspatial/hydro/tests/test_flow_direction_mfd.py similarity index 99% rename from xrspatial/tests/test_flow_direction_mfd.py rename to xrspatial/hydro/tests/test_flow_direction_mfd.py index 0a6d42cd..a689ebbb 100644 --- a/xrspatial/tests/test_flow_direction_mfd.py +++ b/xrspatial/hydro/tests/test_flow_direction_mfd.py @@ -4,8 +4,8 @@ import pytest import xarray as xr -from xrspatial import flow_direction_mfd -from xrspatial.flow_direction_mfd import NEIGHBOR_NAMES +from xrspatial.hydro import flow_direction_mfd +from xrspatial.hydro.flow_direction_mfd import NEIGHBOR_NAMES from xrspatial.tests.general_checks import (create_test_raster, cuda_and_cupy_available, dask_array_available) diff --git a/xrspatial/tests/test_flow_length.py b/xrspatial/hydro/tests/test_flow_length_d8.py similarity index 97% rename from xrspatial/tests/test_flow_length.py rename to xrspatial/hydro/tests/test_flow_length_d8.py index bfc48360..6fdb2ac1 100644 --- a/xrspatial/tests/test_flow_length.py +++ b/xrspatial/hydro/tests/test_flow_length_d8.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial.flow_length import flow_length +from xrspatial.hydro.flow_length_d8 import flow_length_d8 as flow_length from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -214,7 +214,7 @@ class TestFlowLengthDask: ]) @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask(self, chunks, direction): - from xrspatial.flow_direction import flow_direction + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction np.random.seed(123) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -255,7 +255,7 @@ class TestFlowLengthCuPy: @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_cupy(self, direction): - from xrspatial.flow_direction import 
flow_direction + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -281,7 +281,7 @@ class TestFlowLengthDaskCuPy: @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask_cupy(self, direction): - from xrspatial.flow_direction import flow_direction + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) diff --git a/xrspatial/tests/test_flow_length_dinf.py b/xrspatial/hydro/tests/test_flow_length_dinf.py similarity index 96% rename from xrspatial/tests/test_flow_length_dinf.py rename to xrspatial/hydro/tests/test_flow_length_dinf.py index bf2ff158..28b6a881 100644 --- a/xrspatial/tests/test_flow_length_dinf.py +++ b/xrspatial/hydro/tests/test_flow_length_dinf.py @@ -6,7 +6,7 @@ import pytest import xarray as xr -from xrspatial.flow_length_dinf import flow_length_dinf +from xrspatial.hydro.flow_length_dinf import flow_length_dinf from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -197,7 +197,7 @@ class TestFlowDirectionDinfIntegration: """Integration test with actual flow_direction_dinf output.""" def test_bowl_downstream(self): - from xrspatial.flow_direction_dinf import flow_direction_dinf + from xrspatial.hydro.flow_direction_dinf import flow_direction_dinf elev = np.array([ [9, 8, 9], @@ -221,7 +221,7 @@ class TestFlowLengthDinfDask: ]) @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask(self, chunks, direction): - from xrspatial.flow_direction_dinf import flow_direction_dinf + from xrspatial.hydro.flow_direction_dinf import flow_direction_dinf np.random.seed(123) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -261,7 +261,7 @@ class TestFlowLengthDinfCuPy: @pytest.mark.parametrize("direction", ['downstream', 
'upstream']) def test_numpy_equals_cupy(self, direction): - from xrspatial.flow_direction_dinf import flow_direction_dinf + from xrspatial.hydro.flow_direction_dinf import flow_direction_dinf np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -287,7 +287,7 @@ class TestFlowLengthDinfDaskCuPy: @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask_cupy(self, direction): - from xrspatial.flow_direction_dinf import flow_direction_dinf + from xrspatial.hydro.flow_direction_dinf import flow_direction_dinf np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) diff --git a/xrspatial/tests/test_flow_length_mfd.py b/xrspatial/hydro/tests/test_flow_length_mfd.py similarity index 97% rename from xrspatial/tests/test_flow_length_mfd.py rename to xrspatial/hydro/tests/test_flow_length_mfd.py index 7c037d65..990538cd 100644 --- a/xrspatial/tests/test_flow_length_mfd.py +++ b/xrspatial/hydro/tests/test_flow_length_mfd.py @@ -6,7 +6,7 @@ import pytest import xarray as xr -from xrspatial.flow_length_mfd import flow_length_mfd +from xrspatial.hydro.flow_length_mfd import flow_length_mfd from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -307,7 +307,7 @@ class TestFlowDirectionMfdIntegration: """Integration test with actual flow_direction_mfd output.""" def test_bowl_downstream(self): - from xrspatial.flow_direction_mfd import flow_direction_mfd + from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd # Bowl DEM: center is lowest elev = np.array([ @@ -334,7 +334,7 @@ class TestFlowLengthMfdDask: ]) @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask(self, chunks, direction): - from xrspatial.flow_direction_mfd import flow_direction_mfd + from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd np.random.seed(123) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -373,7 +373,7 
@@ class TestFlowLengthMfdCuPy: @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_cupy(self, direction): - from xrspatial.flow_direction_mfd import flow_direction_mfd + from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -399,7 +399,7 @@ class TestFlowLengthMfdDaskCuPy: @pytest.mark.parametrize("direction", ['downstream', 'upstream']) def test_numpy_equals_dask_cupy(self, direction): - from xrspatial.flow_direction_mfd import flow_direction_mfd + from xrspatial.hydro.flow_direction_mfd import flow_direction_mfd np.random.seed(42) elev = np.random.uniform(0, 100, (6, 6)).astype(np.float64) diff --git a/xrspatial/tests/test_flow_path.py b/xrspatial/hydro/tests/test_flow_path_d8.py similarity index 99% rename from xrspatial/tests/test_flow_path.py rename to xrspatial/hydro/tests/test_flow_path_d8.py index 026baddc..f7fa6667 100644 --- a/xrspatial/tests/test_flow_path.py +++ b/xrspatial/hydro/tests/test_flow_path_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import flow_path +from xrspatial.hydro import flow_path from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/hydro/tests/test_flow_path_dinf.py b/xrspatial/hydro/tests/test_flow_path_dinf.py new file mode 100644 index 00000000..ab2e548f --- /dev/null +++ b/xrspatial/hydro/tests/test_flow_path_dinf.py @@ -0,0 +1,219 @@ +import math + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.hydro.flow_path_dinf import flow_path_dinf +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +def _make_fd_and_sp(fd_data, sp_data, backend='numpy', chunks=(3, 3)): + fd = create_test_raster(fd_data.astype(np.float64), backend=backend, chunks=chunks) + sp = create_test_raster(sp_data.astype(np.float64), backend=backend, 
chunks=chunks) + return fd, sp + + +def test_single_path_cardinal_east(): + """All cells angle=0 (pure east), pit at right column.""" + fd = np.full((3, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 # pits + sp = np.full((3, 4), np.nan) + sp[0, 0] = 5.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + assert result.data[0, 0] == 5.0 + assert result.data[0, 1] == 5.0 + assert result.data[0, 2] == 5.0 + assert result.data[0, 3] == 5.0 + assert np.isnan(result.data[1, 0]) + + +def test_single_path_diagonal(): + """Angle = pi/4 (NE), verify path traces NE.""" + fd = np.full((4, 4), math.pi / 4, dtype=np.float64) + fd[0, :] = -1.0 # pits at top row + sp = np.full((4, 4), np.nan) + sp[3, 0] = 1.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + # pi/4 = pure NE (k=1, alpha=0, w1=1.0 for NE) + # Path: (3,0) -> (2,1) -> (1,2) -> (0,3) pit + assert result.data[3, 0] == 1.0 + assert result.data[2, 1] == 1.0 + assert result.data[1, 2] == 1.0 + assert result.data[0, 3] == 1.0 + + +def test_dominant_neighbor(): + """Angle between E and NE, dominant neighbor is E (w1 > w2).""" + # angle = 0.1 (close to 0, so w1 = E is dominant) + fd = np.full((2, 4), 0.1, dtype=np.float64) + fd[:, -1] = -1.0 + sp = np.full((2, 4), np.nan) + sp[1, 0] = 3.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + # Dominant = E, so path goes east: (1,0) -> (1,1) -> (1,2) -> (1,3) + assert result.data[1, 0] == 3.0 + assert result.data[1, 1] == 3.0 + assert result.data[1, 2] == 3.0 + assert result.data[1, 3] == 3.0 + + +def test_path_stops_at_pit(): + """Pit angle (-1.0) stops the path.""" + fd = np.array([[0.0, -1.0, 0.0]], dtype=np.float64) + sp = np.full((1, 3), np.nan) + sp[0, 0] = 2.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + assert result.data[0, 0] == 2.0 + assert result.data[0, 1] == 2.0 + assert np.isnan(result.data[0, 2]) + + +def 
test_path_stops_at_nan(): + fd = np.array([[0.0, np.nan, 0.0]], dtype=np.float64) + sp = np.full((1, 3), np.nan) + sp[0, 0] = 1.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + assert result.data[0, 0] == 1.0 + assert result.data[0, 1] == 1.0 + assert np.isnan(result.data[0, 2]) + + +def test_path_stops_at_edge(): + fd = np.full((1, 3), 0.0, dtype=np.float64) # all east + sp = np.full((1, 3), np.nan) + sp[0, 0] = 3.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + assert result.data[0, 0] == 3.0 + assert result.data[0, 1] == 3.0 + assert result.data[0, 2] == 3.0 + + +def test_multiple_paths(): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + sp = np.full((4, 4), np.nan) + sp[0, 0] = 10.0 + sp[3, 0] = 20.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + for c in range(4): + assert result.data[0, c] == 10.0 + assert result.data[3, c] == 20.0 + assert np.isnan(result.data[1, 0]) + + +def test_paths_overlap_last_wins(): + # angle ~ 1.178 => k=1, alpha~0.393, w1~0.5 w2~0.5, NE dominant since w1>=w2 + # Use angle for SE: k=6 gives S, k=7 gives SE + # angle = 7*pi/4 = SE direction (pure), so path goes SE + se_angle = 7 * math.pi / 4 + fd = np.full((3, 3), se_angle, dtype=np.float64) + fd[-1, :] = -1.0 # bottom row pits + fd[:, -1] = -1.0 # right col pits + sp = np.full((3, 3), np.nan) + sp[0, 0] = 10.0 # (0,0)->SE->(1,1)->SE->(2,2) + sp[0, 1] = 20.0 # (0,1)->SE->(1,2) pit + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + + assert result.data[0, 0] == 10.0 + assert result.data[0, 1] == 20.0 + # (1,1) visited by sp[0,0] then overwritten? 
No, sp[0,1] path goes to (1,2) + # sp[0,0] goes SE to (1,1), then (1,1) pit -> stop + assert result.data[1, 1] == 10.0 + + +def test_output_dtype(): + fd = np.full((2, 2), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + sp = np.full((2, 2), np.nan) + sp[0, 0] = 1.0 + + fd_da, sp_da = _make_fd_and_sp(fd, sp) + result = flow_path_dinf(fd_da, sp_da) + assert result.data.dtype == np.float64 + + +@dask_array_available +@pytest.mark.parametrize("chunks", [ + (1, 1), (2, 2), (3, 3), (1, 4), (4, 1), +]) +def test_numpy_equals_dask(chunks): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + fd[2, :] = math.pi / 2 # north + sp = np.full((4, 4), np.nan) + sp[0, 0] = 1.0 + sp[3, 3] = 2.0 + + fd_np, sp_np = _make_fd_and_sp(fd, sp, backend='numpy') + fd_dk, sp_dk = _make_fd_and_sp(fd, sp, backend='dask', chunks=chunks) + + np_result = flow_path_dinf(fd_np, sp_np) + dk_result = flow_path_dinf(fd_dk, sp_dk) + + np.testing.assert_allclose( + np_result.data, dk_result.data.compute(), equal_nan=True) + + +@cuda_and_cupy_available +def test_numpy_equals_cupy(): + fd = np.full((3, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + sp = np.full((3, 4), np.nan) + sp[0, 0] = 1.0 + sp[2, 0] = 2.0 + + fd_np, sp_np = _make_fd_and_sp(fd, sp, backend='numpy') + fd_cp, sp_cp = _make_fd_and_sp(fd, sp, backend='cupy') + + np_result = flow_path_dinf(fd_np, sp_np) + cp_result = flow_path_dinf(fd_cp, sp_cp) + + np.testing.assert_allclose( + np_result.data, cp_result.data.get(), equal_nan=True) + + +@dask_array_available +@cuda_and_cupy_available +def test_numpy_equals_dask_cupy(): + fd = np.full((3, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + sp = np.full((3, 4), np.nan) + sp[0, 0] = 1.0 + sp[2, 0] = 2.0 + + fd_np, sp_np = _make_fd_and_sp(fd, sp, backend='numpy') + fd_dcp, sp_dcp = _make_fd_and_sp(fd, sp, backend='dask+cupy', chunks=(2, 2)) + + np_result = flow_path_dinf(fd_np, sp_np) + dcp_result = flow_path_dinf(fd_dcp, sp_dcp) + + np.testing.assert_allclose( + np_result.data, 
dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/hydro/tests/test_flow_path_mfd.py b/xrspatial/hydro/tests/test_flow_path_mfd.py new file mode 100644 index 00000000..ff5b71a5 --- /dev/null +++ b/xrspatial/hydro/tests/test_flow_path_mfd.py @@ -0,0 +1,252 @@ +import numpy as np +import pytest +import xarray as xr + +from xrspatial.hydro.flow_path_mfd import flow_path_mfd +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +def _make_mfd_raster(fractions, backend='numpy', chunks=(3, 3)): + _, H, W = fractions.shape + neighbor_names = ['E', 'SE', 'S', 'SW', 'W', 'NW', 'N', 'NE'] + y_coords = np.arange(H, dtype=np.float64) + x_coords = np.arange(W, dtype=np.float64) + da_obj = xr.DataArray( + fractions.astype(np.float64), + dims=('neighbor', 'y', 'x'), + coords={'neighbor': neighbor_names, 'y': y_coords, 'x': x_coords}, + name='mfd_fdir', + ) + if backend == 'dask': + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + elif backend == 'cupy': + import cupy as cp + da_obj = da_obj.copy(data=cp.asarray(da_obj.data)) + elif backend == 'dask+cupy': + import cupy as cp + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + da_obj = da_obj.copy(data=da_obj.data.map_blocks( + cp.asarray, dtype=da_obj.dtype, meta=cp.array((), dtype=da_obj.dtype))) + return da_obj + + +def _make_all_east(H, W): + """MFD fractions: all flow east, pits at right column.""" + fracs = np.zeros((8, H, W), dtype=np.float64) + fracs[0, :, :-1] = 1.0 # E direction for non-pit cells + # Right column: all zero = pit + return fracs + + +def _make_all_south(H, W): + """MFD fractions: all flow south, pits at bottom row.""" + fracs = np.zeros((8, H, W), dtype=np.float64) + fracs[2, :-1, :] = 1.0 # S direction for non-pit cells + return fracs + + +def test_single_path_east(): + fracs = _make_all_east(3, 4) + sp = np.full((3, 
4), np.nan) + sp[0, 0] = 5.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + for c in range(4): + assert result.data[0, c] == 5.0 + assert np.isnan(result.data[1, 0]) + + +def test_single_path_south(): + fracs = _make_all_south(4, 3) + sp = np.full((4, 3), np.nan) + sp[0, 1] = 7.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + for r in range(4): + assert result.data[r, 1] == 7.0 + assert np.isnan(result.data[0, 0]) + + +def test_dominant_neighbor_selection(): + """Fractions split between E (0.7) and SE (0.3); dominant = E.""" + fracs = np.zeros((8, 2, 4), dtype=np.float64) + fracs[0, :, :-1] = 0.7 # E + fracs[1, :, :-1] = 0.3 # SE + sp = np.full((2, 4), np.nan) + sp[0, 0] = 1.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + # Should go east: (0,0)->(0,1)->(0,2)->(0,3) + for c in range(4): + assert result.data[0, c] == 1.0 + + +def test_path_stops_at_pit(): + fracs = np.zeros((8, 1, 3), dtype=np.float64) + fracs[0, 0, 0] = 1.0 # E at (0,0) + # (0,1) all zero = pit + sp = np.full((1, 3), np.nan) + sp[0, 0] = 2.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + assert result.data[0, 0] == 2.0 + assert result.data[0, 1] == 2.0 + assert np.isnan(result.data[0, 2]) + + +def test_path_stops_at_nan(): + fracs = np.zeros((8, 1, 3), dtype=np.float64) + fracs[0, 0, 0] = 1.0 # E at (0,0) + fracs[:, 0, 1] = np.nan # nodata at (0,1) + sp = np.full((1, 3), np.nan) + sp[0, 0] = 1.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + assert result.data[0, 0] == 1.0 + assert result.data[0, 1] == 1.0 # reached before NaN check + assert np.isnan(result.data[0, 2]) + + +def 
test_path_stops_at_edge(): + fracs = np.zeros((8, 1, 3), dtype=np.float64) + fracs[0, :, :] = 1.0 # all E + sp = np.full((1, 3), np.nan) + sp[0, 0] = 3.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + for c in range(3): + assert result.data[0, c] == 3.0 + + +def test_multiple_paths(): + fracs = _make_all_east(4, 4) + sp = np.full((4, 4), np.nan) + sp[0, 0] = 10.0 + sp[3, 0] = 20.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + for c in range(4): + assert result.data[0, c] == 10.0 + assert result.data[3, c] == 20.0 + assert np.isnan(result.data[1, 0]) + + +def test_paths_overlap_last_wins(): + # Both start points flow SE to (1,1) + fracs = np.zeros((8, 3, 3), dtype=np.float64) + fracs[1, :, :] = 1.0 # all SE + fracs[:, -1, :] = 0.0 # bottom row pits + fracs[:, :, -1] = 0.0 # right col pits + + sp = np.full((3, 3), np.nan) + sp[0, 0] = 10.0 # -> (1,1) -> pit + sp[0, 1] = 20.0 # -> (1,2) pit + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + + assert result.data[0, 0] == 10.0 + assert result.data[0, 1] == 20.0 + assert result.data[1, 1] == 10.0 + + +def test_output_dtype(): + fracs = _make_all_east(2, 3) + sp = np.full((2, 3), np.nan) + sp[0, 0] = 1.0 + + fd_da = _make_mfd_raster(fracs) + sp_da = create_test_raster(sp.astype(np.float64)) + result = flow_path_mfd(fd_da, sp_da) + assert result.data.dtype == np.float64 + + +@dask_array_available +@pytest.mark.parametrize("chunks", [ + (1, 1), (2, 2), (3, 3), (1, 4), (4, 1), +]) +def test_numpy_equals_dask(chunks): + fracs = _make_all_east(4, 4) + fracs[1, 2, :3] = 0.5 # add some SE flow in row 2 + fracs[0, 2, :3] = 0.5 + sp = np.full((4, 4), np.nan) + sp[0, 0] = 1.0 + sp[3, 0] = 2.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + sp_np = 
create_test_raster(sp.astype(np.float64), backend='numpy') + fd_dk = _make_mfd_raster(fracs, backend='dask', chunks=chunks) + sp_dk = create_test_raster(sp.astype(np.float64), backend='dask', chunks=chunks) + + np_result = flow_path_mfd(fd_np, sp_np) + dk_result = flow_path_mfd(fd_dk, sp_dk) + + np.testing.assert_allclose( + np_result.data, dk_result.data.compute(), equal_nan=True) + + +@cuda_and_cupy_available +def test_numpy_equals_cupy(): + fracs = _make_all_east(3, 4) + sp = np.full((3, 4), np.nan) + sp[0, 0] = 1.0 + sp[2, 0] = 2.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + sp_np = create_test_raster(sp.astype(np.float64), backend='numpy') + fd_cp = _make_mfd_raster(fracs, backend='cupy') + sp_cp = create_test_raster(sp.astype(np.float64), backend='cupy') + + np_result = flow_path_mfd(fd_np, sp_np) + cp_result = flow_path_mfd(fd_cp, sp_cp) + + np.testing.assert_allclose( + np_result.data, cp_result.data.get(), equal_nan=True) + + +@dask_array_available +@cuda_and_cupy_available +def test_numpy_equals_dask_cupy(): + fracs = _make_all_east(3, 4) + sp = np.full((3, 4), np.nan) + sp[0, 0] = 1.0 + sp[2, 0] = 2.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + sp_np = create_test_raster(sp.astype(np.float64), backend='numpy') + fd_dcp = _make_mfd_raster(fracs, backend='dask+cupy', chunks=(2, 2)) + sp_dcp = create_test_raster(sp.astype(np.float64), backend='dask+cupy', chunks=(2, 2)) + + np_result = flow_path_mfd(fd_np, sp_np) + dcp_result = flow_path_mfd(fd_dcp, sp_dcp) + + np.testing.assert_allclose( + np_result.data, dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/tests/test_hand.py b/xrspatial/hydro/tests/test_hand_d8.py similarity index 94% rename from xrspatial/tests/test_hand.py rename to xrspatial/hydro/tests/test_hand_d8.py index 3427d91f..6080117f 100644 --- a/xrspatial/tests/test_hand.py +++ b/xrspatial/hydro/tests/test_hand_d8.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial.hand import hand 
+from xrspatial.hydro.hand_d8 import hand_d8 as hand from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -157,8 +157,8 @@ class TestHandDask: (2, 2), (3, 3), (2, 6), (6, 2), (6, 6), ]) def test_numpy_equals_dask(self, chunks): - from xrspatial.flow_direction import flow_direction - from xrspatial.flow_accumulation import flow_accumulation + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction + from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation np.random.seed(42) elev_data = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -208,8 +208,8 @@ class TestHandCuPy: """Cross-backend: numpy vs cupy.""" def test_numpy_equals_cupy(self): - from xrspatial.flow_direction import flow_direction - from xrspatial.flow_accumulation import flow_accumulation + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction + from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation np.random.seed(42) elev_data = np.random.uniform(0, 100, (6, 6)).astype(np.float64) @@ -238,8 +238,8 @@ class TestHandDaskCuPy: """Cross-backend: numpy vs dask+cupy.""" def test_numpy_equals_dask_cupy(self): - from xrspatial.flow_direction import flow_direction - from xrspatial.flow_accumulation import flow_accumulation + from xrspatial.hydro.flow_direction_d8 import flow_direction_d8 as flow_direction + from xrspatial.hydro.flow_accumulation_d8 import flow_accumulation_d8 as flow_accumulation np.random.seed(42) elev_data = np.random.uniform(0, 100, (6, 6)).astype(np.float64) diff --git a/xrspatial/hydro/tests/test_hand_dinf.py b/xrspatial/hydro/tests/test_hand_dinf.py new file mode 100644 index 00000000..4ce30341 --- /dev/null +++ b/xrspatial/hydro/tests/test_hand_dinf.py @@ -0,0 +1,186 @@ +import numpy as np +import pytest + +from xrspatial.hydro.hand_dinf import hand_dinf +from xrspatial.tests.general_checks import ( + create_test_raster, 
+ cuda_and_cupy_available, + dask_array_available, +) + + +def _make_hand_dinf_rasters(fd_data, fa_data, elev_data, backend='numpy', + chunks=(3, 3)): + attrs = {'res': (1.0, 1.0)} + fd = create_test_raster(fd_data, backend=backend, name='fdir_dinf', + attrs=attrs, chunks=chunks) + fa = create_test_raster(fa_data, backend=backend, name='fa', + attrs=attrs, chunks=chunks) + el = create_test_raster(elev_data, backend=backend, name='elev', + attrs=attrs, chunks=chunks) + return fd, fa, el + + +class TestStreamCellsZero: + def test_stream_cells_zero(self): + # All flow east (angle=0), pit at right. Stream at col 2. + fd = np.array([ + [0.0, 0.0, -1.0], + [0.0, 0.0, -1.0], + [0.0, 0.0, -1.0], + ], dtype=np.float64) + fa = np.array([ + [1.0, 1.0, 3.0], + [1.0, 1.0, 3.0], + [1.0, 1.0, 3.0], + ], dtype=np.float64) + elev = np.array([ + [10.0, 5.0, 1.0], + [10.0, 5.0, 1.0], + [10.0, 5.0, 1.0], + ], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=3) + for r in range(3): + np.testing.assert_allclose(result.data[r, 2], 0.0, rtol=1e-10) + + +class TestSlopeToStream: + def test_slope_to_stream(self): + fd = np.array([[0.0, 0.0, 0.0, 0.0, -1.0]], dtype=np.float64) + fa = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float64) + elev = np.array([[50.0, 40.0, 30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=4) + np.testing.assert_allclose(result.data[0, 3], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 4], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 2], 10.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 1], 20.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 0], 30.0, rtol=1e-10) + + +class TestAllStream: + def test_all_stream(self): + fd = np.array([[0.0, 0.0, -1.0], + [0.0, 0.0, -1.0]], dtype=np.float64) + fa = np.array([[1.0, 2.0, 3.0], + [1.0, 2.0, 
3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0], + [30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=0) + np.testing.assert_allclose(result.data, 0.0, rtol=1e-10) + + +class TestNoStreamPit: + def test_no_stream_pit(self): + fd = np.array([[0.0, 0.0, -1.0]], dtype=np.float64) + fa = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=1000) + np.testing.assert_allclose(result.data[0, 2], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 1], 10.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 0], 20.0, rtol=1e-10) + + +class TestNanHandling: + def test_nan_handling(self): + fd = np.array([[0.0, np.nan, -1.0]], dtype=np.float64) + fa = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=3) + assert np.isnan(result.data[0, 1]) + assert not np.isnan(result.data[0, 0]) + + +class TestEdgeExitDrain: + def test_edge_exit_drain(self): + fd = np.full((2, 3), 0.0, dtype=np.float64) # all east, no pit + fa = np.full((2, 3), 1.0, dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0], + [30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_dinf_rasters(fd, fa, elev) + result = hand_dinf(fd_r, fa_r, el_r, threshold=1000) + for r in range(2): + np.testing.assert_allclose(result.data[r, 2], 0.0, rtol=1e-10) + for r in range(2): + np.testing.assert_allclose(result.data[r, 1], 10.0, rtol=1e-10) + + +@dask_array_available +class TestHandDinfDask: + @pytest.mark.parametrize("chunks", [ + (2, 2), (3, 3), (2, 6), (6, 2), (6, 6), + ]) + def test_numpy_equals_dask(self, chunks): + fd = np.full((6, 6), 0.0, dtype=np.float64) + 
fd[:, -1] = -1.0 + fa = np.ones((6, 6), dtype=np.float64) + for c in range(6): + fa[:, c] = c + 1 + elev = np.random.RandomState(42).uniform(0, 100, (6, 6)).astype(np.float64) + + fd_np, fa_np, el_np = _make_hand_dinf_rasters(fd, fa, elev, backend='numpy') + fd_da, fa_da, el_da = _make_hand_dinf_rasters(fd, fa, elev, backend='dask', chunks=chunks) + + result_np = hand_dinf(fd_np, fa_np, el_np, threshold=5) + result_da = hand_dinf(fd_da, fa_da, el_da, threshold=5) + + np.testing.assert_allclose( + result_np.data, result_da.data.compute(), + equal_nan=True, rtol=1e-10) + + +@cuda_and_cupy_available +class TestHandDinfCuPy: + def test_numpy_equals_cupy(self): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + fa = np.ones((4, 4), dtype=np.float64) + for c in range(4): + fa[:, c] = c + 1 + elev = np.array([ + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + ], dtype=np.float64) + + fd_np, fa_np, el_np = _make_hand_dinf_rasters(fd, fa, elev, backend='numpy') + fd_cu, fa_cu, el_cu = _make_hand_dinf_rasters(fd, fa, elev, backend='cupy') + + result_np = hand_dinf(fd_np, fa_np, el_np, threshold=3) + result_cu = hand_dinf(fd_cu, fa_cu, el_cu, threshold=3) + + np.testing.assert_allclose( + result_np.data, result_cu.data.get(), + equal_nan=True, rtol=1e-10) + + +@cuda_and_cupy_available +@dask_array_available +class TestHandDinfDaskCuPy: + def test_numpy_equals_dask_cupy(self): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + fa = np.ones((4, 4), dtype=np.float64) + for c in range(4): + fa[:, c] = c + 1 + elev = np.array([ + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + ], dtype=np.float64) + + fd_np, fa_np, el_np = _make_hand_dinf_rasters(fd, fa, elev, backend='numpy') + fd_dc, fa_dc, el_dc = _make_hand_dinf_rasters(fd, fa, elev, backend='dask+cupy', chunks=(2, 2)) + + result_np = hand_dinf(fd_np, fa_np, el_np, threshold=3) + result_dc = hand_dinf(fd_dc, fa_dc, el_dc, 
threshold=3) + + np.testing.assert_allclose( + result_np.data, result_dc.data.compute().get(), + equal_nan=True, rtol=1e-10) diff --git a/xrspatial/hydro/tests/test_hand_mfd.py b/xrspatial/hydro/tests/test_hand_mfd.py new file mode 100644 index 00000000..0e930c51 --- /dev/null +++ b/xrspatial/hydro/tests/test_hand_mfd.py @@ -0,0 +1,220 @@ +import numpy as np +import pytest +import xarray as xr + +from xrspatial.hydro.hand_mfd import hand_mfd +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +def _make_mfd_raster(fractions, backend='numpy', chunks=(3, 3)): + _, H, W = fractions.shape + neighbor_names = ['E', 'SE', 'S', 'SW', 'W', 'NW', 'N', 'NE'] + y_coords = np.arange(H, dtype=np.float64) + x_coords = np.arange(W, dtype=np.float64) + da_obj = xr.DataArray( + fractions.astype(np.float64), + dims=('neighbor', 'y', 'x'), + coords={'neighbor': neighbor_names, 'y': y_coords, 'x': x_coords}, + name='mfd_fdir', + ) + if backend == 'dask': + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + elif backend == 'cupy': + import cupy as cp + da_obj = da_obj.copy(data=cp.asarray(da_obj.data)) + elif backend == 'dask+cupy': + import cupy as cp + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + da_obj = da_obj.copy(data=da_obj.data.map_blocks( + cp.asarray, dtype=da_obj.dtype, meta=cp.array((), dtype=da_obj.dtype))) + return da_obj + + +def _make_all_east(H, W): + fracs = np.zeros((8, H, W), dtype=np.float64) + fracs[0, :, :-1] = 1.0 # E for non-pit cells + return fracs + + +def _make_hand_mfd_rasters(fracs, fa_data, elev_data, backend='numpy', + chunks=(3, 3)): + attrs = {'res': (1.0, 1.0)} + fd = _make_mfd_raster(fracs, backend=backend, chunks=chunks) + fa = create_test_raster(fa_data, backend=backend, name='fa', + attrs=attrs, chunks=chunks) + el = create_test_raster(elev_data, backend=backend, name='elev', 
+ attrs=attrs, chunks=chunks) + return fd, fa, el + + +class TestStreamCellsZero: + def test_stream_cells_zero(self): + fracs = _make_all_east(3, 3) + fa = np.array([ + [1.0, 1.0, 3.0], + [1.0, 1.0, 3.0], + [1.0, 1.0, 3.0], + ], dtype=np.float64) + elev = np.array([ + [10.0, 5.0, 1.0], + [10.0, 5.0, 1.0], + [10.0, 5.0, 1.0], + ], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + result = hand_mfd(fd_r, fa_r, el_r, threshold=3) + for r in range(3): + np.testing.assert_allclose(result.data[r, 2], 0.0, rtol=1e-10) + + +class TestSlopeToStream: + def test_slope_to_stream(self): + fracs = np.zeros((8, 1, 5), dtype=np.float64) + fracs[0, 0, :-1] = 1.0 # E for all but last + fa = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float64) + elev = np.array([[50.0, 40.0, 30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + result = hand_mfd(fd_r, fa_r, el_r, threshold=4) + np.testing.assert_allclose(result.data[0, 3], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 4], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 2], 10.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 1], 20.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 0], 30.0, rtol=1e-10) + + +class TestAllStream: + def test_all_stream(self): + fracs = _make_all_east(2, 3) + fa = np.array([[1.0, 2.0, 3.0], + [1.0, 2.0, 3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0], + [30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + result = hand_mfd(fd_r, fa_r, el_r, threshold=0) + np.testing.assert_allclose(result.data, 0.0, rtol=1e-10) + + +class TestNoStreamPit: + def test_no_stream_pit(self): + fracs = np.zeros((8, 1, 3), dtype=np.float64) + fracs[0, 0, :-1] = 1.0 + fa = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + 
result = hand_mfd(fd_r, fa_r, el_r, threshold=1000) + np.testing.assert_allclose(result.data[0, 2], 0.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 1], 10.0, rtol=1e-10) + np.testing.assert_allclose(result.data[0, 0], 20.0, rtol=1e-10) + + +class TestNanHandling: + def test_nan_handling(self): + fracs = np.zeros((8, 1, 3), dtype=np.float64) + fracs[0, 0, 0] = 1.0 + fracs[:, 0, 1] = np.nan + fa = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + result = hand_mfd(fd_r, fa_r, el_r, threshold=3) + assert np.isnan(result.data[0, 1]) + assert not np.isnan(result.data[0, 0]) + + +class TestEdgeExitDrain: + def test_edge_exit_drain(self): + fracs = np.zeros((8, 2, 3), dtype=np.float64) + fracs[0, :, :] = 1.0 # all E, exits at right edge + fa = np.full((2, 3), 1.0, dtype=np.float64) + elev = np.array([[30.0, 20.0, 10.0], + [30.0, 20.0, 10.0]], dtype=np.float64) + fd_r, fa_r, el_r = _make_hand_mfd_rasters(fracs, fa, elev) + result = hand_mfd(fd_r, fa_r, el_r, threshold=1000) + for r in range(2): + np.testing.assert_allclose(result.data[r, 2], 0.0, rtol=1e-10) + for r in range(2): + np.testing.assert_allclose(result.data[r, 1], 10.0, rtol=1e-10) + + +@dask_array_available +class TestHandMfdDask: + @pytest.mark.parametrize("chunks", [ + (2, 2), (3, 3), (2, 6), (6, 2), (6, 6), + ]) + def test_numpy_equals_dask(self, chunks): + fracs = _make_all_east(6, 6) + fa = np.ones((6, 6), dtype=np.float64) + for c in range(6): + fa[:, c] = c + 1 + elev = np.random.RandomState(42).uniform(0, 100, (6, 6)).astype(np.float64) + + fd_np, fa_np, el_np = _make_hand_mfd_rasters( + fracs, fa, elev, backend='numpy') + fd_da, fa_da, el_da = _make_hand_mfd_rasters( + fracs, fa, elev, backend='dask', chunks=chunks) + + result_np = hand_mfd(fd_np, fa_np, el_np, threshold=5) + result_da = hand_mfd(fd_da, fa_da, el_da, threshold=5) + + np.testing.assert_allclose( + 
result_np.data, result_da.data.compute(), + equal_nan=True, rtol=1e-10) + + +@cuda_and_cupy_available +class TestHandMfdCuPy: + def test_numpy_equals_cupy(self): + fracs = _make_all_east(4, 4) + fa = np.ones((4, 4), dtype=np.float64) + for c in range(4): + fa[:, c] = c + 1 + elev = np.array([ + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + ], dtype=np.float64) + + fd_np, fa_np, el_np = _make_hand_mfd_rasters( + fracs, fa, elev, backend='numpy') + fd_cu, fa_cu, el_cu = _make_hand_mfd_rasters( + fracs, fa, elev, backend='cupy') + + result_np = hand_mfd(fd_np, fa_np, el_np, threshold=3) + result_cu = hand_mfd(fd_cu, fa_cu, el_cu, threshold=3) + + np.testing.assert_allclose( + result_np.data, result_cu.data.get(), + equal_nan=True, rtol=1e-10) + + +@cuda_and_cupy_available +@dask_array_available +class TestHandMfdDaskCuPy: + def test_numpy_equals_dask_cupy(self): + fracs = _make_all_east(4, 4) + fa = np.ones((4, 4), dtype=np.float64) + for c in range(4): + fa[:, c] = c + 1 + elev = np.array([ + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + [40, 30, 20, 10], + ], dtype=np.float64) + + fd_np, fa_np, el_np = _make_hand_mfd_rasters( + fracs, fa, elev, backend='numpy') + fd_dc, fa_dc, el_dc = _make_hand_mfd_rasters( + fracs, fa, elev, backend='dask+cupy', chunks=(2, 2)) + + result_np = hand_mfd(fd_np, fa_np, el_np, threshold=3) + result_dc = hand_mfd(fd_dc, fa_dc, el_dc, threshold=3) + + np.testing.assert_allclose( + result_np.data, result_dc.data.compute().get(), + equal_nan=True, rtol=1e-10) diff --git a/xrspatial/tests/test_sink.py b/xrspatial/hydro/tests/test_sink_d8.py similarity index 99% rename from xrspatial/tests/test_sink.py rename to xrspatial/hydro/tests/test_sink_d8.py index 670ff048..cee78f87 100644 --- a/xrspatial/tests/test_sink.py +++ b/xrspatial/hydro/tests/test_sink_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import sink +from xrspatial.hydro import sink from 
xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_snap_pour_point.py b/xrspatial/hydro/tests/test_snap_pour_point_d8.py similarity index 99% rename from xrspatial/tests/test_snap_pour_point.py rename to xrspatial/hydro/tests/test_snap_pour_point_d8.py index 513e1c5a..fe70b13b 100644 --- a/xrspatial/tests/test_snap_pour_point.py +++ b/xrspatial/hydro/tests/test_snap_pour_point_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import snap_pour_point +from xrspatial.hydro import snap_pour_point from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_stream_link.py b/xrspatial/hydro/tests/test_stream_link_d8.py similarity index 98% rename from xrspatial/tests/test_stream_link.py rename to xrspatial/hydro/tests/test_stream_link_d8.py index dd6afb03..8b29227d 100644 --- a/xrspatial/tests/test_stream_link.py +++ b/xrspatial/hydro/tests/test_stream_link_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import stream_link +from xrspatial.hydro import stream_link from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -197,7 +197,7 @@ def test_stream_link_numpy_equals_dask(chunks): @dask_array_available def test_stream_link_dask_random(): """Random acyclic flow: dask matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(42) elev = rng.random((8, 10)).astype(np.float64) @@ -274,7 +274,7 @@ def test_stream_link_numpy_equals_dask_cupy(): @cuda_and_cupy_available def test_stream_link_dask_cupy_random(): """Random acyclic flow: dask+cupy stream_link matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(952) elev = rng.random((8, 
10)).astype(np.float64) diff --git a/xrspatial/tests/test_stream_link_dinf.py b/xrspatial/hydro/tests/test_stream_link_dinf.py similarity index 98% rename from xrspatial/tests/test_stream_link_dinf.py rename to xrspatial/hydro/tests/test_stream_link_dinf.py index d6658a44..40dc5d6e 100644 --- a/xrspatial/tests/test_stream_link_dinf.py +++ b/xrspatial/hydro/tests/test_stream_link_dinf.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial.stream_link_dinf import stream_link_dinf +from xrspatial.hydro.stream_link_dinf import stream_link_dinf from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_stream_link_mfd.py b/xrspatial/hydro/tests/test_stream_link_mfd.py similarity index 98% rename from xrspatial/tests/test_stream_link_mfd.py rename to xrspatial/hydro/tests/test_stream_link_mfd.py index 9f2bd9c6..4fb7434d 100644 --- a/xrspatial/tests/test_stream_link_mfd.py +++ b/xrspatial/hydro/tests/test_stream_link_mfd.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial.stream_link_mfd import stream_link_mfd +from xrspatial.hydro.stream_link_mfd import stream_link_mfd from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_stream_order.py b/xrspatial/hydro/tests/test_stream_order_d8.py similarity index 89% rename from xrspatial/tests/test_stream_order.py rename to xrspatial/hydro/tests/test_stream_order_d8.py index 258b2ca0..b3de0dae 100644 --- a/xrspatial/tests/test_stream_order.py +++ b/xrspatial/hydro/tests/test_stream_order_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import stream_order +from xrspatial.hydro import stream_order from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -49,7 +49,7 @@ def test_y_confluence(): [1.0, 4.0, 1.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='strahler') 
+ ordering='strahler') # (0,0) and (0,2) are headwaters: order 1 assert result.data[0, 0] == 1.0 assert result.data[0, 2] == 1.0 @@ -79,7 +79,7 @@ def test_unequal_confluence(): [1.0, 5.0, 1.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='strahler') + ordering='strahler') assert result.data[0, 0] == 1.0 # headwater assert result.data[0, 2] == 1.0 # headwater assert result.data[1, 1] == 2.0 # two order-1 merge @@ -108,7 +108,7 @@ def test_order_3(): [1.0, 1.0, 1.0, 1.0, 1.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='strahler') + ordering='strahler') assert result.data[1, 1] == 2.0 # two order-1 merge assert result.data[1, 3] == 2.0 # two order-1 merge assert result.data[1, 2] == 3.0 # two order-2 merge @@ -130,7 +130,7 @@ def test_order_3_equal_merge(): [1.0, 2.0, 3.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=2, - method='strahler') + ordering='strahler') # Only cells with accum >= 2 are stream cells assert np.isnan(result.data[0, 0]) # accum=1, not stream assert np.isnan(result.data[2, 0]) # accum=1, not stream @@ -153,7 +153,7 @@ def test_linear_chain(): flow_dir = np.array([[1.0, 1.0, 1.0, 1.0, 0.0]], dtype=np.float64) flow_accum = np.array([[1.0, 2.0, 3.0, 4.0, 5.0]], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='strahler') + ordering='strahler') expected = np.array([[1.0, 1.0, 1.0, 1.0, 1.0]], dtype=np.float64) np.testing.assert_array_equal(result.data, expected) @@ -175,7 +175,7 @@ def test_shreve_y_confluence(): [1.0, 4.0, 1.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='shreve') + ordering='shreve') assert result.data[0, 0] == 1.0 assert result.data[0, 2] == 1.0 assert result.data[1, 1] == 2.0 # sum of two 1s @@ -195,7 +195,7 @@ def test_shreve_triple(): [1.0, 4.0, 1.0], ], dtype=np.float64) result = 
_make_stream_order(flow_dir, flow_accum, threshold=1, - method='shreve') + ordering='shreve') assert result.data[0, 0] == 1.0 assert result.data[0, 1] == 1.0 assert result.data[0, 2] == 1.0 @@ -218,7 +218,7 @@ def test_shreve_cascade(): [1.0, 1.0, 1.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='shreve') + ordering='shreve') assert result.data[0, 0] == 1.0 # headwater assert result.data[1, 0] == 1.0 # headwater assert result.data[0, 1] == 2.0 # A + C @@ -235,7 +235,7 @@ def test_threshold_filters_cells(): flow_dir = np.array([[1.0, 1.0, 1.0, 0.0]], dtype=np.float64) flow_accum = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=3, - method='strahler') + ordering='strahler') assert np.isnan(result.data[0, 0]) assert np.isnan(result.data[0, 1]) assert result.data[0, 2] == 1.0 @@ -247,7 +247,7 @@ def test_threshold_zero(): flow_dir = np.array([[1.0, 1.0, 0.0]], dtype=np.float64) flow_accum = np.array([[1.0, 2.0, 3.0]], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=0, - method='strahler') + ordering='strahler') assert not np.any(np.isnan(result.data)) @@ -260,7 +260,7 @@ def test_no_streams(): flow_dir = np.array([[1.0, 0.0]], dtype=np.float64) flow_accum = np.array([[1.0, 2.0]], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=100, - method='strahler') + ordering='strahler') assert np.all(np.isnan(result.data)) @@ -273,7 +273,7 @@ def test_nan_handling(): [1.0, 2.0, 3.0], ], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, threshold=1, - method='strahler') + ordering='strahler') assert np.isnan(result.data[0, 1]) # (0,0) still valid stream cell assert result.data[0, 0] == 1.0 @@ -284,7 +284,7 @@ def test_pit_in_stream(): flow_dir = np.array([[1.0, 0.0]], dtype=np.float64) flow_accum = np.array([[1.0, 2.0]], dtype=np.float64) result = _make_stream_order(flow_dir, flow_accum, 
threshold=1, - method='strahler') + ordering='strahler') assert result.data[0, 0] == 1.0 assert result.data[0, 1] == 1.0 # gets inflow from (0,0), single -> 1 @@ -312,8 +312,8 @@ def test_invalid_method(): flow_accum = np.array([[1.0, 2.0]], dtype=np.float64) fd_da = create_test_raster(flow_dir) fa_da = create_test_raster(flow_accum) - with pytest.raises(ValueError, match="method must be"): - stream_order(fd_da, fa_da, method='horton') + with pytest.raises(ValueError, match="ordering must be"): + stream_order(fd_da, fa_da, ordering='horton') # ==================================================================== @@ -346,8 +346,8 @@ def test_dask_equivalence(chunks, method): da_fd = create_test_raster(flow_dir, backend='dask', chunks=chunks) da_fa = create_test_raster(flow_accum, backend='dask', chunks=chunks) - np_result = stream_order(np_fd, np_fa, threshold=1, method=method) - da_result = stream_order(da_fd, da_fa, threshold=1, method=method) + np_result = stream_order(np_fd, np_fa, threshold=1, ordering=method) + da_result = stream_order(da_fd, da_fa, threshold=1, ordering=method) np.testing.assert_allclose( np_result.data, da_result.data.compute(), equal_nan=True) @@ -374,12 +374,12 @@ def test_dask_cross_tile_confluence(method): np_fd = create_test_raster(flow_dir, backend='numpy') np_fa = create_test_raster(flow_accum, backend='numpy') - np_result = stream_order(np_fd, np_fa, threshold=1, method=method) + np_result = stream_order(np_fd, np_fa, threshold=1, ordering=method) for chunks in [(2, 2), (3, 3), (2, 5)]: da_fd = create_test_raster(flow_dir, backend='dask', chunks=chunks) da_fa = create_test_raster(flow_accum, backend='dask', chunks=chunks) - da_result = stream_order(da_fd, da_fa, threshold=1, method=method) + da_result = stream_order(da_fd, da_fa, threshold=1, ordering=method) np.testing.assert_allclose( np_result.data, da_result.data.compute(), equal_nan=True, ), f"Mismatch with chunks={chunks}, method={method}" @@ -389,7 +389,7 @@ def 
test_dask_cross_tile_confluence(method): @pytest.mark.parametrize("method", ['strahler', 'shreve']) def test_dask_random(method): """Random acyclic flow: dask matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(123) elev = rng.random((8, 10)).astype(np.float64) @@ -397,14 +397,14 @@ def test_dask_random(method): fd = flow_direction(elev_da) fa = flow_accumulation(fd) - np_result = stream_order(fd, fa, threshold=3, method=method) + np_result = stream_order(fd, fa, threshold=3, ordering=method) fd_data = fd.data fa_data = fa.data for chunks in [(3, 3), (4, 5), (8, 10)]: da_fd = create_test_raster(fd_data, backend='dask', chunks=chunks) da_fa = create_test_raster(fa_data, backend='dask', chunks=chunks) - da_result = stream_order(da_fd, da_fa, threshold=3, method=method) + da_result = stream_order(da_fd, da_fa, threshold=3, ordering=method) np.testing.assert_allclose( np_result.data, da_result.data.compute(), equal_nan=True, ), f"Mismatch with chunks={chunks}, method={method}" @@ -420,8 +420,8 @@ def test_gpu_equivalence(method): cp_fd = create_test_raster(flow_dir, backend='cupy') cp_fa = create_test_raster(flow_accum, backend='cupy') - np_result = stream_order(np_fd, np_fa, threshold=1, method=method) - cp_result = stream_order(cp_fd, cp_fa, threshold=1, method=method) + np_result = stream_order(np_fd, np_fa, threshold=1, ordering=method) + cp_result = stream_order(cp_fd, cp_fa, threshold=1, ordering=method) np.testing.assert_allclose( np_result.data, cp_result.data.get(), equal_nan=True) @@ -437,8 +437,8 @@ def test_dask_cupy_equivalence(method): dcp_fd = create_test_raster(flow_dir, backend='dask+cupy', chunks=(3, 3)) dcp_fa = create_test_raster(flow_accum, backend='dask+cupy', chunks=(3, 3)) - np_result = stream_order(np_fd, np_fa, threshold=1, method=method) - dcp_result = stream_order(dcp_fd, dcp_fa, threshold=1, method=method) + np_result = 
stream_order(np_fd, np_fa, threshold=1, ordering=method) + dcp_result = stream_order(dcp_fd, dcp_fa, threshold=1, ordering=method) np.testing.assert_allclose( np_result.data, dcp_result.data.compute().get(), equal_nan=True) @@ -448,7 +448,7 @@ def test_dask_cupy_equivalence(method): @pytest.mark.parametrize("method", ['strahler', 'shreve']) def test_dask_cupy_random(method): """Random acyclic flow: dask+cupy matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(952) elev = rng.random((8, 10)).astype(np.float64) @@ -456,14 +456,14 @@ def test_dask_cupy_random(method): fd = flow_direction(elev_da) fa = flow_accumulation(fd) - np_result = stream_order(fd, fa, threshold=3, method=method) + np_result = stream_order(fd, fa, threshold=3, ordering=method) fd_data = fd.data fa_data = fa.data for chunks in [(3, 3), (4, 5), (2, 2)]: dcp_fd = create_test_raster(fd_data, backend='dask+cupy', chunks=chunks) dcp_fa = create_test_raster(fa_data, backend='dask+cupy', chunks=chunks) - dcp_result = stream_order(dcp_fd, dcp_fa, threshold=3, method=method) + dcp_result = stream_order(dcp_fd, dcp_fa, threshold=3, ordering=method) np.testing.assert_allclose( np_result.data, dcp_result.data.compute().get(), equal_nan=True, ), f"Mismatch with chunks={chunks}, method={method}" diff --git a/xrspatial/tests/test_stream_order_dinf.py b/xrspatial/hydro/tests/test_stream_order_dinf.py similarity index 99% rename from xrspatial/tests/test_stream_order_dinf.py rename to xrspatial/hydro/tests/test_stream_order_dinf.py index d36338a0..e5303d45 100644 --- a/xrspatial/tests/test_stream_order_dinf.py +++ b/xrspatial/hydro/tests/test_stream_order_dinf.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial.stream_order_dinf import stream_order_dinf +from xrspatial.hydro.stream_order_dinf import stream_order_dinf from xrspatial.tests.general_checks import ( create_test_raster, 
cuda_and_cupy_available, diff --git a/xrspatial/tests/test_stream_order_mfd.py b/xrspatial/hydro/tests/test_stream_order_mfd.py similarity index 99% rename from xrspatial/tests/test_stream_order_mfd.py rename to xrspatial/hydro/tests/test_stream_order_mfd.py index 921e13f8..daff206d 100644 --- a/xrspatial/tests/test_stream_order_mfd.py +++ b/xrspatial/hydro/tests/test_stream_order_mfd.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial.stream_order_mfd import stream_order_mfd +from xrspatial.hydro.stream_order_mfd import stream_order_mfd from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_twi.py b/xrspatial/hydro/tests/test_twi_d8.py similarity index 99% rename from xrspatial/tests/test_twi.py rename to xrspatial/hydro/tests/test_twi_d8.py index 8a9f3a5f..69f787d9 100644 --- a/xrspatial/tests/test_twi.py +++ b/xrspatial/hydro/tests/test_twi_d8.py @@ -4,7 +4,7 @@ import pytest import xarray as xr -from xrspatial.twi import twi +from xrspatial.hydro.twi_d8 import twi_d8 as twi from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, diff --git a/xrspatial/tests/test_watershed.py b/xrspatial/hydro/tests/test_watershed_d8.py similarity index 98% rename from xrspatial/tests/test_watershed.py rename to xrspatial/hydro/tests/test_watershed_d8.py index ca64e397..75e83eec 100644 --- a/xrspatial/tests/test_watershed.py +++ b/xrspatial/hydro/tests/test_watershed_d8.py @@ -2,7 +2,7 @@ import pytest import xarray as xr -from xrspatial import watershed, basins +from xrspatial.hydro import watershed, basins from xrspatial.tests.general_checks import ( create_test_raster, cuda_and_cupy_available, @@ -152,7 +152,7 @@ def test_multiple_paths_to_same_pour_point(): def test_basins_backward_compat(): """basins() wrapper delegates to basin() correctly.""" - from xrspatial import basin + from xrspatial.hydro import basin flow_dir = np.array([ [2.0, 4.0, 8.0], @@ 
-274,7 +274,7 @@ def test_watershed_dask_two_pour_points(): @dask_array_available def test_watershed_dask_random(): """Random acyclic flow_dir: dask watershed matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(456) elev = rng.random((8, 10)).astype(np.float64) @@ -335,7 +335,7 @@ def test_watershed_numpy_equals_dask_cupy(): @cuda_and_cupy_available def test_watershed_dask_cupy_random(): """Random acyclic flow_dir: dask+cupy watershed matches numpy.""" - from xrspatial import flow_direction, flow_accumulation + from xrspatial.hydro import flow_direction, flow_accumulation rng = np.random.default_rng(952) elev = rng.random((8, 10)).astype(np.float64) diff --git a/xrspatial/hydro/tests/test_watershed_dinf.py b/xrspatial/hydro/tests/test_watershed_dinf.py new file mode 100644 index 00000000..edf3c4d2 --- /dev/null +++ b/xrspatial/hydro/tests/test_watershed_dinf.py @@ -0,0 +1,191 @@ +import math + +import numpy as np +import pytest + +from xrspatial.hydro.watershed_dinf import watershed_dinf +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +def _make_fd_and_pp(fd_data, pp_data, backend='numpy', chunks=(3, 3)): + fd = create_test_raster(fd_data.astype(np.float64), backend=backend, chunks=chunks) + pp = create_test_raster(pp_data.astype(np.float64), backend=backend, chunks=chunks) + return fd, pp + + +def test_single_pour_point(): + """5x5 bowl: all cells flow toward center, single pour point.""" + # Build angles pointing to center (2,2) + fd = np.full((5, 5), -1.0, dtype=np.float64) + # Approximate: use angles that point toward center + # SE=7*pi/4, S=3*pi/2, SW=5*pi/4, E=0, W=pi, NE=pi/4, N=pi/2, NW=3*pi/4 + for r in range(5): + for c in range(5): + if r == 2 and c == 2: + fd[r, c] = -1.0 # pit + continue + dr = 2 - r + dc = 2 - c + angle = math.atan2(-dr, dc) # atan2(-dy, dx) 
for grid coords + if angle < 0: + angle += 2 * math.pi + fd[r, c] = angle + + pp = np.full((5, 5), np.nan, dtype=np.float64) + pp[2, 2] = 42.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + # All cells should reach pour point + expected = np.full((5, 5), 42.0, dtype=np.float64) + np.testing.assert_allclose(result.data, expected) + + +def test_two_pour_points(): + """Two separate drainage areas: single row flows east to two pits.""" + fd = np.full((1, 6), 0.0, dtype=np.float64) # all east + fd[0, 2] = -1.0 # pit + fd[0, 5] = -1.0 # pit + + pp = np.full((1, 6), np.nan, dtype=np.float64) + pp[0, 2] = 10.0 + pp[0, 5] = 20.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + for c in range(3): + assert result.data[0, c] == 10.0, f"Failed at (0,{c})" + for c in range(3, 6): + assert result.data[0, c] == 20.0, f"Failed at (0,{c})" + + +def test_unreachable_cells(): + """Cells not draining to any pour point produce NaN.""" + fd = np.array([ + [0.0, -1.0, 0.0, -1.0], + ], dtype=np.float64) + pp = np.full((1, 4), np.nan, dtype=np.float64) + pp[0, 1] = 5.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + assert result.data[0, 0] == 5.0 + assert result.data[0, 1] == 5.0 + assert np.isnan(result.data[0, 2]) + assert np.isnan(result.data[0, 3]) + + +def test_nan_handling(): + fd = np.array([ + [0.0, -1.0], + [np.nan, 0.0], + ], dtype=np.float64) + pp = np.full((2, 2), np.nan, dtype=np.float64) + pp[0, 1] = 7.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + assert result.data[0, 0] == 7.0 + assert result.data[0, 1] == 7.0 + assert np.isnan(result.data[1, 0]) + + +def test_linear_chain_east(): + N = 6 + fd = np.full((1, N), 0.0, dtype=np.float64) + fd[0, -1] = -1.0 + pp = np.full((1, N), np.nan, dtype=np.float64) + pp[0, -1] = 99.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + expected = 
np.full((1, N), 99.0, dtype=np.float64) + np.testing.assert_allclose(result.data, expected) + + +def test_pour_point_at_non_pit(): + fd = np.array([[0.0, 0.0, -1.0]], dtype=np.float64) + pp = np.full((1, 3), np.nan, dtype=np.float64) + pp[0, 1] = 50.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + + assert result.data[0, 0] == 50.0 + assert result.data[0, 1] == 50.0 + assert np.isnan(result.data[0, 2]) + + +def test_output_dtype(): + fd = np.full((3, 3), -1.0, dtype=np.float64) + pp = np.full((3, 3), np.nan, dtype=np.float64) + pp[1, 1] = 1.0 + + fd_da, pp_da = _make_fd_and_pp(fd, pp) + result = watershed_dinf(fd_da, pp_da) + assert result.data.dtype == np.float64 + + +@dask_array_available +@pytest.mark.parametrize("chunks", [ + (2, 2), (3, 3), (5, 5), (1, 1), (2, 5), +]) +def test_numpy_equals_dask(chunks): + fd = np.full((5, 5), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + fd[2, :] = math.pi * 1.5 # south + pp = np.full((5, 5), np.nan, dtype=np.float64) + pp[0, 4] = 42.0 + + np_fd, np_pp = _make_fd_and_pp(fd, pp, backend='numpy') + dk_fd, dk_pp = _make_fd_and_pp(fd, pp, backend='dask', chunks=chunks) + + np_result = watershed_dinf(np_fd, np_pp) + dk_result = watershed_dinf(dk_fd, dk_pp) + + np.testing.assert_allclose( + np_result.data, dk_result.data.compute(), equal_nan=True) + + +@cuda_and_cupy_available +def test_numpy_equals_cupy(): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + pp = np.full((4, 4), np.nan, dtype=np.float64) + pp[0, 3] = 1.0 + + np_fd, np_pp = _make_fd_and_pp(fd, pp, backend='numpy') + cp_fd, cp_pp = _make_fd_and_pp(fd, pp, backend='cupy') + + np_result = watershed_dinf(np_fd, np_pp) + cp_result = watershed_dinf(cp_fd, cp_pp) + + np.testing.assert_allclose( + np_result.data, cp_result.data.get(), equal_nan=True) + + +@dask_array_available +@cuda_and_cupy_available +def test_numpy_equals_dask_cupy(): + fd = np.full((4, 4), 0.0, dtype=np.float64) + fd[:, -1] = -1.0 + pp = np.full((4, 4), 
np.nan, dtype=np.float64) + pp[0, 3] = 1.0 + + np_fd, np_pp = _make_fd_and_pp(fd, pp, backend='numpy') + dcp_fd, dcp_pp = _make_fd_and_pp(fd, pp, backend='dask+cupy', chunks=(2, 2)) + + np_result = watershed_dinf(np_fd, np_pp) + dcp_result = watershed_dinf(dcp_fd, dcp_pp) + + np.testing.assert_allclose( + np_result.data, dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/hydro/tests/test_watershed_mfd.py b/xrspatial/hydro/tests/test_watershed_mfd.py new file mode 100644 index 00000000..dbf9955b --- /dev/null +++ b/xrspatial/hydro/tests/test_watershed_mfd.py @@ -0,0 +1,204 @@ +import numpy as np +import pytest +import xarray as xr + +from xrspatial.hydro.watershed_mfd import watershed_mfd +from xrspatial.tests.general_checks import ( + create_test_raster, + cuda_and_cupy_available, + dask_array_available, +) + + +def _make_mfd_raster(fractions, backend='numpy', chunks=(3, 3)): + _, H, W = fractions.shape + neighbor_names = ['E', 'SE', 'S', 'SW', 'W', 'NW', 'N', 'NE'] + y_coords = np.arange(H, dtype=np.float64) + x_coords = np.arange(W, dtype=np.float64) + da_obj = xr.DataArray( + fractions.astype(np.float64), + dims=('neighbor', 'y', 'x'), + coords={'neighbor': neighbor_names, 'y': y_coords, 'x': x_coords}, + name='mfd_fdir', + ) + if backend == 'dask': + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + elif backend == 'cupy': + import cupy as cp + da_obj = da_obj.copy(data=cp.asarray(da_obj.data)) + elif backend == 'dask+cupy': + import cupy as cp + import dask.array as da + da_obj = da_obj.chunk({'neighbor': 8, 'y': chunks[0], 'x': chunks[1]}) + da_obj = da_obj.copy(data=da_obj.data.map_blocks( + cp.asarray, dtype=da_obj.dtype, meta=cp.array((), dtype=da_obj.dtype))) + return da_obj + + +def _make_all_east(H, W): + fracs = np.zeros((8, H, W), dtype=np.float64) + fracs[0, :, :-1] = 1.0 # E + return fracs + + +def test_single_pour_point(): + """All cells flow east, pour point at right column.""" 
+ fracs = _make_all_east(3, 5) + pp = np.full((3, 5), np.nan, dtype=np.float64) + pp[1, 4] = 42.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + + # Row 1: all reach pour point + for c in range(5): + assert result.data[1, c] == 42.0 + # Other rows: flow east but never reach the pour point (different row) + # Actually: all cells in row 0 and 2 flow east to col 4 which is a pit (frac=0) + # They never reach pp at (1,4) since they go to (r, 4) not (1, 4) + assert np.isnan(result.data[0, 0]) + + +def test_two_pour_points(): + fracs = np.zeros((8, 3, 6), dtype=np.float64) + fracs[0, :, :2] = 1.0 # E in first 2 cols + fracs[0, :, 3:5] = 1.0 # E in cols 3-4 + # cols 2 and 5 are pits + + pp = np.full((3, 6), np.nan, dtype=np.float64) + pp[1, 2] = 10.0 + pp[1, 5] = 20.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + + for c in range(3): + assert result.data[1, c] == 10.0 + for c in range(3, 6): + assert result.data[1, c] == 20.0 + + +def test_unreachable_cells(): + fracs = np.zeros((8, 1, 4), dtype=np.float64) + fracs[0, 0, 0] = 1.0 # E at (0,0) + # (0,1) pit, (0,2) flow E, (0,3) pit + fracs[0, 0, 2] = 1.0 + + pp = np.full((1, 4), np.nan, dtype=np.float64) + pp[0, 1] = 5.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + + assert result.data[0, 0] == 5.0 + assert result.data[0, 1] == 5.0 + assert np.isnan(result.data[0, 2]) + assert np.isnan(result.data[0, 3]) + + +def test_nan_handling(): + fracs = np.zeros((8, 2, 2), dtype=np.float64) + fracs[0, 0, 0] = 1.0 # (0,0) flows E + fracs[:, 1, 0] = np.nan # (1,0) nodata + # (0,1) pit, (1,1) pit + + pp = np.full((2, 2), np.nan, dtype=np.float64) + pp[0, 1] = 7.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + + assert result.data[0, 0] == 7.0 + assert result.data[0, 1] == 7.0 + assert 
np.isnan(result.data[1, 0]) + + +def test_linear_chain_east(): + N = 6 + fracs = np.zeros((8, 1, N), dtype=np.float64) + fracs[0, 0, :-1] = 1.0 # E for all but last + + pp = np.full((1, N), np.nan, dtype=np.float64) + pp[0, -1] = 99.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + + expected = np.full((1, N), 99.0, dtype=np.float64) + np.testing.assert_allclose(result.data, expected) + + +def test_output_dtype(): + fracs = _make_all_east(2, 3) + pp = np.full((2, 3), np.nan, dtype=np.float64) + pp[0, 0] = 1.0 + + fd_da = _make_mfd_raster(fracs) + pp_da = create_test_raster(pp) + result = watershed_mfd(fd_da, pp_da) + assert result.data.dtype == np.float64 + + +@dask_array_available +@pytest.mark.parametrize("chunks", [ + (2, 2), (3, 3), (1, 1), (2, 6), +]) +def test_numpy_equals_dask(chunks): + fracs = _make_all_east(4, 6) + pp = np.full((4, 6), np.nan, dtype=np.float64) + pp[0, 5] = 1.0 + pp[3, 5] = 2.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + pp_np = create_test_raster(pp, backend='numpy') + fd_dk = _make_mfd_raster(fracs, backend='dask', chunks=chunks) + pp_dk = create_test_raster(pp, backend='dask', chunks=chunks) + + np_result = watershed_mfd(fd_np, pp_np) + dk_result = watershed_mfd(fd_dk, pp_dk) + + np.testing.assert_allclose( + np_result.data, dk_result.data.compute(), equal_nan=True) + + +@cuda_and_cupy_available +def test_numpy_equals_cupy(): + fracs = _make_all_east(3, 4) + pp = np.full((3, 4), np.nan, dtype=np.float64) + pp[1, 3] = 1.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + pp_np = create_test_raster(pp, backend='numpy') + fd_cp = _make_mfd_raster(fracs, backend='cupy') + pp_cp = create_test_raster(pp, backend='cupy') + + np_result = watershed_mfd(fd_np, pp_np) + cp_result = watershed_mfd(fd_cp, pp_cp) + + np.testing.assert_allclose( + np_result.data, cp_result.data.get(), equal_nan=True) + + +@dask_array_available +@cuda_and_cupy_available +def 
test_numpy_equals_dask_cupy(): + fracs = _make_all_east(3, 4) + pp = np.full((3, 4), np.nan, dtype=np.float64) + pp[1, 3] = 1.0 + + fd_np = _make_mfd_raster(fracs, backend='numpy') + pp_np = create_test_raster(pp, backend='numpy') + fd_dcp = _make_mfd_raster(fracs, backend='dask+cupy', chunks=(2, 2)) + pp_dcp = create_test_raster(pp, backend='dask+cupy', chunks=(2, 2)) + + np_result = watershed_mfd(fd_np, pp_np) + dcp_result = watershed_mfd(fd_dcp, pp_dcp) + + np.testing.assert_allclose( + np_result.data, dcp_result.data.compute().get(), equal_nan=True) diff --git a/xrspatial/twi.py b/xrspatial/hydro/twi_d8.py similarity index 99% rename from xrspatial/twi.py rename to xrspatial/hydro/twi_d8.py index 193c59e0..0af3cbe9 100644 --- a/xrspatial/twi.py +++ b/xrspatial/hydro/twi_d8.py @@ -29,7 +29,7 @@ _TAN_MIN = np.tan(np.radians(0.001)) -def twi(flow_accum: xr.DataArray, +def twi_d8(flow_accum: xr.DataArray, slope_agg: xr.DataArray, name: str = 'twi') -> xr.DataArray: """Compute the Topographic Wetness Index. diff --git a/xrspatial/watershed.py b/xrspatial/hydro/watershed_d8.py similarity index 99% rename from xrspatial/watershed.py rename to xrspatial/hydro/watershed_d8.py index 439177d4..09accc4b 100644 --- a/xrspatial/watershed.py +++ b/xrspatial/hydro/watershed_d8.py @@ -39,7 +39,7 @@ class cupy: # type: ignore[no-redef] is_dask_cupy, ngjit, ) -from xrspatial._boundary_store import BoundaryStore +from xrspatial.hydro._boundary_store import BoundaryStore from xrspatial.dataset_support import supports_dataset @@ -929,7 +929,7 @@ def _watershed_dask_cupy(flow_dir_da, pour_points_da): # ===================================================================== @supports_dataset -def watershed(flow_dir: xr.DataArray, +def watershed_d8(flow_dir: xr.DataArray, pour_points: xr.DataArray, name: str = 'watershed') -> xr.DataArray: """Label each cell with the pour point it drains to. 
@@ -1000,7 +1000,7 @@ def watershed(flow_dir: xr.DataArray, attrs=flow_dir.attrs) -def basins(flow_dir, name='basins'): +def basins_d8(flow_dir, name='basins'): """Backward-compatible wrapper; use :func:`basin` instead.""" - from .basin import basin - return basin(flow_dir, name=name) + from .basin_d8 import basin_d8 + return basin_d8(flow_dir, name=name) diff --git a/xrspatial/hydro/watershed_dinf.py b/xrspatial/hydro/watershed_dinf.py new file mode 100644 index 00000000..84bb0f67 --- /dev/null +++ b/xrspatial/hydro/watershed_dinf.py @@ -0,0 +1,638 @@ +"""D-infinity watershed delineation. + +Labels each cell with the pour point it drains to, using D-inf +dominant-neighbor downstream tracing with path compression. + +Algorithm +--------- +CPU : downstream tracing with path compression, using the dominant + neighbor from D-inf angle decomposition. +GPU : CuPy-via-CPU. +Dask: iterative tile sweep with exit-label propagation. +""" + +from __future__ import annotations + +import math + +import numpy as np +import xarray as xr + +try: + import cupy +except ImportError: + class cupy: # type: ignore[no-redef] + ndarray = False + +try: + import dask.array as da +except ImportError: + da = None + +from xrspatial.hydro.flow_accumulation_dinf import _angle_to_neighbors +from xrspatial.hydro.flow_path_dinf import _angle_to_neighbors_py +from xrspatial.hydro._boundary_store import BoundaryStore +from xrspatial.utils import ( + _validate_raster, + has_cuda_and_cupy, + is_cupy_array, + is_dask_cupy, + ngjit, +) +from xrspatial.dataset_support import supports_dataset + + +def _to_numpy_f64(arr): + if hasattr(arr, 'get'): + arr = arr.get() + return np.asarray(arr, dtype=np.float64) + + +def _dominant_offset_py(angle): + """Return (dy, dx) for the dominant D-inf neighbor, or (0,0) for pit/NaN.""" + dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors_py(angle) + if w1 <= 0.0 and w2 <= 0.0: + return (0, 0) + if w1 >= w2: + return (dy1, dx1) + return (dy2, dx2) + + +# 
# =====================================================================
# CPU kernel
# =====================================================================

@ngjit
def _watershed_dinf_cpu(flow_dir, labels, state, h, w):
    """Downstream tracing with path compression for D-inf watershed.

    State: 0=nodata, 1=unresolved, 2=in-trace, 3=resolved.
    ``labels`` and ``state`` are updated in place; ``labels`` is returned.
    """
    # Scratch buffers recording the cells visited on the current trace.
    path_r = np.empty(h * w, dtype=np.int64)
    path_c = np.empty(h * w, dtype=np.int64)

    for r in range(h):
        for c in range(w):
            if state[r, c] != 1:
                continue

            n_path = 0
            cr, cc = r, c
            resolved_label = np.nan
            resolved = False

            while True:
                st = state[cr, cc]
                if st == 3:
                    # Reached a cell whose label is already known.
                    resolved_label = labels[cr, cc]
                    resolved = True
                    break
                if st != 1:
                    # Nodata, or a cycle back onto the current path.
                    break

                path_r[n_path] = cr
                path_c[n_path] = cc
                n_path += 1
                state[cr, cc] = 2

                angle = flow_dir[cr, cc]
                if angle != angle:  # NaN check without np.isnan
                    break
                dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(angle)
                if w1 <= 0.0 and w2 <= 0.0:
                    break  # pit: no downstream neighbor
                # Step to the dominant (larger-weight) neighbor.
                if w1 >= w2:
                    dy, dx = dy1, dx1
                else:
                    dy, dx = dy2, dx2
                nr, nc = cr + dy, cc + dx
                if nr < 0 or nr >= h or nc < 0 or nc >= w:
                    break  # flows off the grid edge
                cr, cc = nr, nc

            # Path compression: stamp every visited cell with the outcome.
            for i in range(n_path):
                if resolved:
                    labels[path_r[i], path_c[i]] = resolved_label
                    state[path_r[i], path_c[i]] = 3
                else:
                    labels[path_r[i], path_c[i]] = np.nan
                    state[path_r[i], path_c[i]] = 0

    return labels


# =====================================================================
# CuPy backend
# =====================================================================

def _watershed_dinf_cupy(flow_dir_data, pour_points_data):
    """CuPy backend: run the CPU kernel on host, return a device array."""
    import cupy as cp
    fd_np = _to_numpy_f64(flow_dir_data)
    pp_np = _to_numpy_f64(pour_points_data)
    h, w = fd_np.shape
    labels = np.full((h, w), np.nan, dtype=np.float64)
    state = np.zeros((h, w), dtype=np.int8)
    for r in range(h):
        for c in range(w):
            if fd_np[r, c] != fd_np[r, c]:
                pass  # nodata cell: state stays 0
            elif pp_np[r, c] == pp_np[r, c]:
                labels[r, c] = pp_np[r, c]  # pour point: pre-resolved
                state[r, c] = 3
            else:
                state[r, c] = 1  # needs tracing
    out = _watershed_dinf_cpu(fd_np, labels, state, h, w)
    return cp.asarray(out)


# =====================================================================
# Dask tile kernel
# =====================================================================

@ngjit
def _watershed_dinf_tile_kernel(flow_dir, h, w, pour_points,
                                exit_top, exit_bottom, exit_left, exit_right,
                                exit_tl, exit_tr, exit_bl, exit_br):
    """Seeded downstream tracing for a D-inf tile.

    Boundary cells whose flow leaves the tile are pre-labeled from the
    ``exit_*`` arrays/scalars (NaN means unknown).  Cells whose trace
    leaves the tile without a known label are left in state 1 so a later
    sweep can resolve them.
    """
    labels = np.empty((h, w), dtype=np.float64)
    state = np.empty((h, w), dtype=np.int8)

    # --- seed nodata / pour-point / unresolved states -----------------
    for r in range(h):
        for c in range(w):
            v = flow_dir[r, c]
            if v != v:
                labels[r, c] = np.nan
                state[r, c] = 0
                continue
            pp = pour_points[r, c]
            if pp == pp:
                labels[r, c] = pp
                state[r, c] = 3
                continue
            labels[r, c] = np.nan
            state[r, c] = 1

    # --- apply known exit labels along the four edges -----------------
    for c in range(w):
        if state[0, c] == 1:
            el = exit_top[c]
            if el == el:
                labels[0, c] = el
                state[0, c] = 3
    for c in range(w):
        if state[h - 1, c] == 1:
            el = exit_bottom[c]
            if el == el:
                labels[h - 1, c] = el
                state[h - 1, c] = 3
    for r in range(h):
        if state[r, 0] == 1:
            el = exit_left[r]
            if el == el:
                labels[r, 0] = el
                state[r, 0] = 3
    for r in range(h):
        if state[r, w - 1] == 1:
            el = exit_right[r]
            if el == el:
                labels[r, w - 1] = el
                state[r, w - 1] = 3

    # --- diagonal (corner) exit labels --------------------------------
    if state[0, 0] == 1 and exit_tl == exit_tl:
        labels[0, 0] = exit_tl
        state[0, 0] = 3
    if state[0, w - 1] == 1 and exit_tr == exit_tr:
        labels[0, w - 1] = exit_tr
        state[0, w - 1] = 3
    if state[h - 1, 0] == 1 and exit_bl == exit_bl:
        labels[h - 1, 0] = exit_bl
        state[h - 1, 0] = 3
    if state[h - 1, w - 1] == 1 and exit_br == exit_br:
        labels[h - 1, w - 1] = exit_br
        state[h - 1, w - 1] = 3

    # --- downstream tracing with path compression ---------------------
    path_r = np.empty(h * w, dtype=np.int64)
    path_c = np.empty(h * w, dtype=np.int64)

    for r in range(h):
        for c in range(w):
            if state[r, c] != 1:
                continue

            n_path = 0
            cr, cc = r, c
            resolved_label = np.nan
            resolved = False
            exit_tile = False

            while True:
                st = state[cr, cc]
                if st == 3:
                    resolved_label = labels[cr, cc]
                    resolved = True
                    break
                if st != 1:
                    break

                path_r[n_path] = cr
                path_c[n_path] = cc
                n_path += 1
                state[cr, cc] = 2

                angle = flow_dir[cr, cc]
                if angle != angle:
                    break
                dy1, dx1, w1, dy2, dx2, w2 = _angle_to_neighbors(angle)
                if w1 <= 0.0 and w2 <= 0.0:
                    break
                if w1 >= w2:
                    dy, dx = dy1, dx1
                else:
                    dy, dx = dy2, dx2
                nr, nc = cr + dy, cc + dx
                if nr < 0 or nr >= h or nc < 0 or nc >= w:
                    exit_tile = True  # leaves this tile; resolve later
                    break
                cr, cc = nr, nc

            for i in range(n_path):
                if resolved:
                    labels[path_r[i], path_c[i]] = resolved_label
                    state[path_r[i], path_c[i]] = 3
                elif exit_tile:
                    # Keep unresolved so the next sweep can retry.
                    state[path_r[i], path_c[i]] = 1
                else:
                    labels[path_r[i], path_c[i]] = np.nan
                    state[path_r[i], path_c[i]] = 0

    return labels
# =====================================================================
# Dask iterative tile sweep
# =====================================================================

def _preprocess_tiles(flow_dir_da, chunks_y, chunks_x):
    """Cache the four boundary rows/cols of each tile's flow directions."""
    n_tile_y = len(chunks_y)
    n_tile_x = len(chunks_x)
    flow_bdry = BoundaryStore(chunks_y, chunks_x, fill_value=np.nan)
    for iy in range(n_tile_y):
        for ix in range(n_tile_x):
            chunk = flow_dir_da.blocks[iy, ix].compute()
            flow_bdry.set('top', iy, ix, _to_numpy_f64(chunk[0, :]))
            flow_bdry.set('bottom', iy, ix, _to_numpy_f64(chunk[-1, :]))
            flow_bdry.set('left', iy, ix, _to_numpy_f64(chunk[:, 0]))
            flow_bdry.set('right', iy, ix, _to_numpy_f64(chunk[:, -1]))
    return flow_bdry


def _compute_exit_labels(iy, ix, boundaries, flow_bdry,
                         chunks_y, chunks_x, n_tile_y, n_tile_x):
    """Compute exit labels for a D-inf tile using the dominant neighbor.

    For each boundary cell whose dominant D-inf step crosses into an
    adjacent tile, look up the label of the receiving cell on that tile's
    facing boundary.  NaN means "no label known yet" (or flows off-grid).
    """
    tile_h = chunks_y[iy]
    tile_w = chunks_x[ix]

    exit_top = np.full(tile_w, np.nan)
    exit_bottom = np.full(tile_w, np.nan)
    exit_left = np.full(tile_h, np.nan)
    exit_right = np.full(tile_h, np.nan)
    exit_tl = np.nan
    exit_tr = np.nan
    exit_bl = np.nan
    exit_br = np.nan

    # Top row: flow crossing into the tile above (and its lateral tiles).
    if iy > 0:
        fdir_top = flow_bdry.get('top', iy, ix)
        nb_labels = boundaries.get('bottom', iy - 1, ix)
        for j in range(tile_w):
            d = _dominant_offset_py(float(fdir_top[j]))
            if d[0] == -1:
                dj = j + d[1]
                if d[1] == 0 and 0 <= dj < len(nb_labels):
                    exit_top[j] = nb_labels[dj]
                elif d[1] == -1:
                    if 0 <= dj < len(nb_labels):
                        exit_top[j] = nb_labels[dj]
                    elif dj < 0 and ix > 0:
                        # Diagonal step into the tile to the upper-left.
                        exit_top[j] = boundaries.get('bottom', iy - 1, ix - 1)[-1]
                elif d[1] == 1:
                    if 0 <= dj < len(nb_labels):
                        exit_top[j] = nb_labels[dj]
                    elif dj >= len(nb_labels) and ix < n_tile_x - 1:
                        # Diagonal step into the tile to the upper-right.
                        exit_top[j] = boundaries.get('bottom', iy - 1, ix + 1)[0]

    # Bottom row: flow crossing into the tile below.
    if iy < n_tile_y - 1:
        fdir_bot = flow_bdry.get('bottom', iy, ix)
        nb_labels = boundaries.get('top', iy + 1, ix)
        for j in range(tile_w):
            d = _dominant_offset_py(float(fdir_bot[j]))
            if d[0] == 1:
                dj = j + d[1]
                if d[1] == 0 and 0 <= dj < len(nb_labels):
                    exit_bottom[j] = nb_labels[dj]
                elif d[1] == 1:
                    if 0 <= dj < len(nb_labels):
                        exit_bottom[j] = nb_labels[dj]
                    elif dj >= len(nb_labels) and ix < n_tile_x - 1:
                        exit_bottom[j] = boundaries.get('top', iy + 1, ix + 1)[0]
                elif d[1] == -1:
                    if 0 <= dj < len(nb_labels):
                        exit_bottom[j] = nb_labels[dj]
                    elif dj < 0 and ix > 0:
                        exit_bottom[j] = boundaries.get('top', iy + 1, ix - 1)[-1]

    # Left column: flow crossing into the tile on the left.  The r==0 and
    # r==tile_h-1 diagonal cases are handled by the corner logic below.
    if ix > 0:
        fdir_left = flow_bdry.get('left', iy, ix)
        nb_labels = boundaries.get('right', iy, ix - 1)
        for r in range(tile_h):
            d = _dominant_offset_py(float(fdir_left[r]))
            if d[1] == -1:
                dr = r + d[0]
                if d[0] == 0 and 0 <= dr < len(nb_labels):
                    exit_left[r] = nb_labels[dr]
                elif d[0] == -1:
                    if r == 0:
                        continue
                    if 0 <= dr < len(nb_labels):
                        exit_left[r] = nb_labels[dr]
                elif d[0] == 1:
                    if r == tile_h - 1:
                        continue
                    if 0 <= dr < len(nb_labels):
                        exit_left[r] = nb_labels[dr]

    # Right column: flow crossing into the tile on the right.
    if ix < n_tile_x - 1:
        fdir_right = flow_bdry.get('right', iy, ix)
        nb_labels = boundaries.get('left', iy, ix + 1)
        for r in range(tile_h):
            d = _dominant_offset_py(float(fdir_right[r]))
            if d[1] == 1:
                dr = r + d[0]
                if d[0] == 0 and 0 <= dr < len(nb_labels):
                    exit_right[r] = nb_labels[dr]
                elif d[0] == -1:
                    if r == 0:
                        continue
                    if 0 <= dr < len(nb_labels):
                        exit_right[r] = nb_labels[dr]
                elif d[0] == 1:
                    if r == tile_h - 1:
                        continue
                    if 0 <= dr < len(nb_labels):
                        exit_right[r] = nb_labels[dr]

    # Edge-of-grid: flow leaving the whole raster has no label.
    if iy == 0:
        fdir_top = flow_bdry.get('top', iy, ix)
        for j in range(tile_w):
            d = _dominant_offset_py(float(fdir_top[j]))
            if d[0] == -1:
                exit_top[j] = np.nan
    if iy == n_tile_y - 1:
        fdir_bot = flow_bdry.get('bottom', iy, ix)
        for j in range(tile_w):
            d = _dominant_offset_py(float(fdir_bot[j]))
            if d[0] == 1:
                exit_bottom[j] = np.nan
    if ix == 0:
        fdir_left = flow_bdry.get('left', iy, ix)
        for r in range(tile_h):
            d = _dominant_offset_py(float(fdir_left[r]))
            if d[1] == -1:
                exit_left[r] = np.nan
    if ix == n_tile_x - 1:
        fdir_right = flow_bdry.get('right', iy, ix)
        for r in range(tile_h):
            d = _dominant_offset_py(float(fdir_right[r]))
            if d[1] == 1:
                exit_right[r] = np.nan

    # Diagonal corners: a corner cell may step into the diagonal tile.
    fdir_tl = flow_bdry.get('top', iy, ix)[0]
    d = _dominant_offset_py(float(fdir_tl))
    if d == (-1, -1):
        if iy > 0 and ix > 0:
            exit_tl = boundaries.get('bottom', iy - 1, ix - 1)[-1]
        else:
            exit_tl = np.nan

    fdir_tr = flow_bdry.get('top', iy, ix)[-1]
    d = _dominant_offset_py(float(fdir_tr))
    if d == (-1, 1):
        if iy > 0 and ix < n_tile_x - 1:
            exit_tr = boundaries.get('bottom', iy - 1, ix + 1)[0]
        else:
            exit_tr = np.nan

    fdir_bl = flow_bdry.get('bottom', iy, ix)[0]
    d = _dominant_offset_py(float(fdir_bl))
    if d == (1, -1):
        if iy < n_tile_y - 1 and ix > 0:
            exit_bl = boundaries.get('top', iy + 1, ix - 1)[-1]
        else:
            exit_bl = np.nan

    fdir_br = flow_bdry.get('bottom', iy, ix)[-1]
    d = _dominant_offset_py(float(fdir_br))
    if d == (1, 1):
        if iy < n_tile_y - 1 and ix < n_tile_x - 1:
            exit_br = boundaries.get('top', iy + 1, ix + 1)[0]
        else:
            exit_br = np.nan

    return (exit_top, exit_bottom, exit_left, exit_right,
            exit_tl, exit_tr, exit_bl, exit_br)
def _process_tile(iy, ix, flow_dir_da, pour_points_da,
                  boundaries, flow_bdry,
                  chunks_y, chunks_x, n_tile_y, n_tile_x):
    """Run one seeded trace over a tile; return True if any boundary changed."""
    chunk = np.asarray(
        flow_dir_da.blocks[iy, ix].compute(), dtype=np.float64)
    pp_chunk = np.asarray(
        pour_points_da.blocks[iy, ix].compute(), dtype=np.float64)
    h, w = chunk.shape

    exits = _compute_exit_labels(
        iy, ix, boundaries, flow_bdry,
        chunks_y, chunks_x, n_tile_y, n_tile_x)

    result = _watershed_dinf_tile_kernel(chunk, h, w, pp_chunk, *exits)

    new_top = result[0, :].copy()
    new_bottom = result[-1, :].copy()
    new_left = result[:, 0].copy()
    new_right = result[:, -1].copy()

    # Detect convergence: any non-(NaN vs NaN) difference on a boundary.
    changed = False
    for side, new in (('top', new_top), ('bottom', new_bottom),
                      ('left', new_left), ('right', new_right)):
        old = boundaries.get(side, iy, ix).copy()
        with np.errstate(invalid='ignore'):
            mask = ~(np.isnan(old) & np.isnan(new))
            if mask.any():
                diff = old[mask] != new[mask]
                if np.any(diff):
                    changed = True
                    break

    boundaries.set('top', iy, ix, new_top)
    boundaries.set('bottom', iy, ix, new_bottom)
    boundaries.set('left', iy, ix, new_left)
    boundaries.set('right', iy, ix, new_right)

    return changed


def _watershed_dinf_dask(flow_dir_da, pour_points_da):
    """Dask backend: iterate tile sweeps until boundary labels converge,
    then emit the final labels lazily via ``map_blocks``.
    """
    chunks_y = flow_dir_da.chunks[0]
    chunks_x = flow_dir_da.chunks[1]
    n_tile_y = len(chunks_y)
    n_tile_x = len(chunks_x)

    flow_bdry = _preprocess_tiles(flow_dir_da, chunks_y, chunks_x)
    flow_bdry = flow_bdry.snapshot()

    boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=np.nan)

    # Labels propagate at least one tile per sweep; forward+backward
    # sweeps per iteration make this bound generous.
    max_iterations = max(n_tile_y, n_tile_x) * 2 + 10

    for _iteration in range(max_iterations):
        any_changed = False

        for iy in range(n_tile_y):
            for ix in range(n_tile_x):
                c = _process_tile(
                    iy, ix, flow_dir_da, pour_points_da,
                    boundaries, flow_bdry,
                    chunks_y, chunks_x, n_tile_y, n_tile_x)
                if c:
                    any_changed = True

        for iy in reversed(range(n_tile_y)):
            for ix in reversed(range(n_tile_x)):
                c = _process_tile(
                    iy, ix, flow_dir_da, pour_points_da,
                    boundaries, flow_bdry,
                    chunks_y, chunks_x, n_tile_y, n_tile_x)
                if c:
                    any_changed = True

        if not any_changed:
            break

    boundaries = boundaries.snapshot()

    def _tile_fn(flow_dir_block, pp_block, block_info=None):
        # map_blocks probes with block_info unset; return a dummy block.
        if block_info is None or 0 not in block_info:
            return np.full(flow_dir_block.shape, np.nan, dtype=np.float64)
        iy, ix = block_info[0]['chunk-location']
        h, w = flow_dir_block.shape
        exits = _compute_exit_labels(
            iy, ix, boundaries, flow_bdry,
            chunks_y, chunks_x, n_tile_y, n_tile_x)
        return _watershed_dinf_tile_kernel(
            np.asarray(flow_dir_block, dtype=np.float64),
            h, w,
            np.asarray(pp_block, dtype=np.float64),
            *exits)

    return da.map_blocks(
        _tile_fn,
        flow_dir_da, pour_points_da,
        dtype=np.float64,
        meta=np.array((), dtype=np.float64),
    )


# =====================================================================
# Dask+CuPy backend
# =====================================================================

def _watershed_dinf_dask_cupy(flow_dir_da, pour_points_da):
    """Dask+CuPy backend: pull blocks to host, run the dask path,
    push the result blocks back to the device."""
    import cupy as cp
    fd_np = flow_dir_da.map_blocks(
        lambda b: b.get(), dtype=flow_dir_da.dtype,
        meta=np.array((), dtype=flow_dir_da.dtype),
    )
    pp_np = pour_points_da.map_blocks(
        lambda b: b.get(), dtype=pour_points_da.dtype,
        meta=np.array((), dtype=pour_points_da.dtype),
    )
    result = _watershed_dinf_dask(fd_np, pp_np)
    return result.map_blocks(
        cp.asarray, dtype=result.dtype,
        meta=cp.array((), dtype=result.dtype),
    )


# =====================================================================
# Public API
# =====================================================================

@supports_dataset
def watershed_dinf(flow_dir_dinf: xr.DataArray,
                   pour_points: xr.DataArray,
                   name: str = 'watershed_dinf') -> xr.DataArray:
    """Label each cell with the pour point it drains to (D-inf).

    Parameters
    ----------
    flow_dir_dinf : xarray.DataArray or xr.Dataset
        2D D-infinity flow direction grid.
    pour_points : xarray.DataArray
        2D raster where non-NaN cells are pour points.  Must have the
        same shape as ``flow_dir_dinf``.
    name : str, default='watershed_dinf'
        Name of output DataArray.

    Returns
    -------
    xarray.DataArray or xr.Dataset
        2D float64 array where each cell = label of its pour point.
        NaN for nodata or unreachable cells.

    Raises
    ------
    ValueError
        If ``pour_points`` and ``flow_dir_dinf`` shapes differ.
    TypeError
        If the underlying array type is unsupported.
    """
    _validate_raster(flow_dir_dinf, func_name='watershed_dinf',
                     name='flow_dir_dinf')

    data = flow_dir_dinf.data
    pp_data = pour_points.data

    # Fail early with a clear message instead of a backend-specific error.
    if pp_data.shape != data.shape:
        raise ValueError(
            f"pour_points shape {pp_data.shape} does not match "
            f"flow_dir_dinf shape {data.shape}")

    if isinstance(data, np.ndarray):
        fd = data.astype(np.float64)
        pp = np.asarray(pp_data, dtype=np.float64)
        h, w = fd.shape
        # Vectorized seeding (replaces a per-cell Python loop):
        # nodata -> state 0, pour point -> state 3 with its label,
        # everything else -> state 1 (unresolved).
        nodata = np.isnan(fd)
        seeded = ~nodata & ~np.isnan(pp)
        labels = np.where(seeded, pp, np.nan)
        state = np.zeros((h, w), dtype=np.int8)
        state[seeded] = 3
        state[~nodata & ~seeded] = 1
        out = _watershed_dinf_cpu(fd, labels, state, h, w)

    elif has_cuda_and_cupy() and is_cupy_array(data):
        out = _watershed_dinf_cupy(data, pp_data)

    elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_dinf):
        out = _watershed_dinf_dask_cupy(data, pp_data)

    elif da is not None and isinstance(data, da.Array):
        out = _watershed_dinf_dask(data, pp_data)

    else:
        raise TypeError(f"Unsupported array type: {type(data)}")

    return xr.DataArray(out,
                        name=name,
                        coords=flow_dir_dinf.coords,
                        dims=flow_dir_dinf.dims,
                        attrs=flow_dir_dinf.attrs)
+ +Algorithm +--------- +CPU : downstream tracing with path compression, following the neighbor + with the highest fraction at each step. +GPU : CuPy-via-CPU. +Dask: iterative tile sweep with exit-label propagation. +""" + +from __future__ import annotations + +import numpy as np +import xarray as xr + +try: + import cupy +except ImportError: + class cupy: # type: ignore[no-redef] + ndarray = False + +try: + import dask.array as da +except ImportError: + da = None + +from xrspatial.hydro._boundary_store import BoundaryStore +from xrspatial.utils import ( + _validate_raster, + has_cuda_and_cupy, + is_cupy_array, + is_dask_cupy, + ngjit, +) +from xrspatial.dataset_support import supports_dataset + +_DY_LIST = [0, 1, 1, 1, 0, -1, -1, -1] +_DX_LIST = [1, 1, 0, -1, -1, -1, 0, 1] + + +def _to_numpy_f64(arr): + if hasattr(arr, 'get'): + arr = arr.get() + return np.asarray(arr, dtype=np.float64) + + +def _dominant_offset_mfd_py(fractions_8): + """Return (dy, dx) of dominant MFD neighbor, or (0,0) for pit/nodata.""" + best_k = -1 + best_f = 0.0 + for k in range(8): + f = float(fractions_8[k]) + if f > best_f: + best_f = f + best_k = k + if best_k == -1: + return (0, 0) + return (_DY_LIST[best_k], _DX_LIST[best_k]) + + +# ===================================================================== +# CPU kernel +# ===================================================================== + +@ngjit +def _watershed_mfd_cpu(fractions, labels, state, h, w): + """Downstream tracing with path compression for MFD watershed.""" + dy = np.array([0, 1, 1, 1, 0, -1, -1, -1], dtype=np.int64) + dx = np.array([1, 1, 0, -1, -1, -1, 0, 1], dtype=np.int64) + + path_r = np.empty(h * w, dtype=np.int64) + path_c = np.empty(h * w, dtype=np.int64) + + for r in range(h): + for c in range(w): + if state[r, c] != 1: + continue + + path_len = 0 + cr, cc = r, c + found_label = np.nan + found = False + + while True: + s = state[cr, cc] + if s == 3: + found_label = labels[cr, cc] + found = True + break + if s != 1: 
+ break + + path_r[path_len] = cr + path_c[path_len] = cc + path_len += 1 + state[cr, cc] = 2 + + chk = fractions[0, cr, cc] + if chk != chk: # NaN + break + + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, cr, cc] + if f > best_frac: + best_frac = f + best_k = k + if best_k == -1: + break + + nr, nc = cr + dy[best_k], cc + dx[best_k] + if nr < 0 or nr >= h or nc < 0 or nc >= w: + break + cr, cc = nr, nc + + for i in range(path_len): + if found: + labels[path_r[i], path_c[i]] = found_label + state[path_r[i], path_c[i]] = 3 + else: + labels[path_r[i], path_c[i]] = np.nan + state[path_r[i], path_c[i]] = 0 + + return labels + + +# ===================================================================== +# CuPy backend +# ===================================================================== + +def _watershed_mfd_cupy(fractions_data, pour_points_data): + import cupy as cp + fr_np = _to_numpy_f64(fractions_data) + pp_np = _to_numpy_f64(pour_points_data) + _, h, w = fr_np.shape + labels = np.full((h, w), np.nan, dtype=np.float64) + state = np.zeros((h, w), dtype=np.int8) + for r in range(h): + for c in range(w): + if fr_np[0, r, c] != fr_np[0, r, c]: + pass + elif pp_np[r, c] == pp_np[r, c]: + labels[r, c] = pp_np[r, c] + state[r, c] = 3 + else: + state[r, c] = 1 + out = _watershed_mfd_cpu(fr_np, labels, state, h, w) + return cp.asarray(out) + + +# ===================================================================== +# Dask tile kernel +# ===================================================================== + +@ngjit +def _watershed_mfd_tile_kernel(fractions, h, w, pour_points, + exit_top, exit_bottom, exit_left, exit_right, + exit_tl, exit_tr, exit_bl, exit_br): + """Seeded downstream tracing for an MFD tile.""" + dy = np.array([0, 1, 1, 1, 0, -1, -1, -1], dtype=np.int64) + dx = np.array([1, 1, 0, -1, -1, -1, 0, 1], dtype=np.int64) + + labels = np.empty((h, w), dtype=np.float64) + state = np.empty((h, w), dtype=np.int8) + + for r in range(h): + for c 
in range(w): + v = fractions[0, r, c] + if v != v: + labels[r, c] = np.nan + state[r, c] = 0 + continue + pp = pour_points[r, c] + if pp == pp: + labels[r, c] = pp + state[r, c] = 3 + continue + labels[r, c] = np.nan + state[r, c] = 1 + + # Apply exit labels + for c in range(w): + if state[0, c] == 1: + el = exit_top[c] + if el == el: + labels[0, c] = el + state[0, c] = 3 + for c in range(w): + if state[h - 1, c] == 1: + el = exit_bottom[c] + if el == el: + labels[h - 1, c] = el + state[h - 1, c] = 3 + for r in range(h): + if state[r, 0] == 1: + el = exit_left[r] + if el == el: + labels[r, 0] = el + state[r, 0] = 3 + for r in range(h): + if state[r, w - 1] == 1: + el = exit_right[r] + if el == el: + labels[r, w - 1] = el + state[r, w - 1] = 3 + + if state[0, 0] == 1 and exit_tl == exit_tl: + labels[0, 0] = exit_tl + state[0, 0] = 3 + if state[0, w - 1] == 1 and exit_tr == exit_tr: + labels[0, w - 1] = exit_tr + state[0, w - 1] = 3 + if state[h - 1, 0] == 1 and exit_bl == exit_bl: + labels[h - 1, 0] = exit_bl + state[h - 1, 0] = 3 + if state[h - 1, w - 1] == 1 and exit_br == exit_br: + labels[h - 1, w - 1] = exit_br + state[h - 1, w - 1] = 3 + + # Downstream tracing + path_r = np.empty(h * w, dtype=np.int64) + path_c = np.empty(h * w, dtype=np.int64) + + for r in range(h): + for c in range(w): + if state[r, c] != 1: + continue + + path_len = 0 + cr, cc = r, c + found_label = np.nan + found = False + exit_tile = False + + while True: + s = state[cr, cc] + if s == 3: + found_label = labels[cr, cc] + found = True + break + if s != 1: + break + + path_r[path_len] = cr + path_c[path_len] = cc + path_len += 1 + state[cr, cc] = 2 + + chk = fractions[0, cr, cc] + if chk != chk: + break + + best_k = -1 + best_frac = 0.0 + for k in range(8): + f = fractions[k, cr, cc] + if f > best_frac: + best_frac = f + best_k = k + if best_k == -1: + break + + nr, nc = cr + dy[best_k], cc + dx[best_k] + if nr < 0 or nr >= h or nc < 0 or nc >= w: + exit_tile = True + break + cr, cc = nr, nc 
# =====================================================================
# Dask iterative tile sweep
# =====================================================================

def _preprocess_mfd_tiles(fractions_da, chunks_y, chunks_x):
    """Cache the boundary fraction vectors (shape (8, edge_len)) per tile."""
    n_tile_y = len(chunks_y)
    n_tile_x = len(chunks_x)
    frac_bdry = {}
    for iy in range(n_tile_y):
        for ix in range(n_tile_x):
            # The fractions array is 3D, so slice manually rather than
            # using 2D .blocks indexing.
            chunk = fractions_da[:, sum(chunks_y[:iy]):sum(chunks_y[:iy+1]),
                                 sum(chunks_x[:ix]):sum(chunks_x[:ix+1])].compute()
            chunk = np.asarray(chunk, dtype=np.float64)
            frac_bdry[('top', iy, ix)] = chunk[:, 0, :].copy()
            frac_bdry[('bottom', iy, ix)] = chunk[:, -1, :].copy()
            frac_bdry[('left', iy, ix)] = chunk[:, :, 0].copy()
            frac_bdry[('right', iy, ix)] = chunk[:, :, -1].copy()
    return frac_bdry


def _compute_exit_labels_mfd(iy, ix, boundaries, frac_bdry,
                             chunks_y, chunks_x, n_tile_y, n_tile_x):
    """Compute exit labels for an MFD tile using the dominant neighbor.

    For each boundary cell whose dominant step crosses into an adjacent
    tile, look up the label of the receiving cell on that tile's facing
    boundary.  NaN means unknown (or off-grid).
    """
    tile_h = chunks_y[iy]
    tile_w = chunks_x[ix]

    exit_top = np.full(tile_w, np.nan)
    exit_bottom = np.full(tile_w, np.nan)
    exit_left = np.full(tile_h, np.nan)
    exit_right = np.full(tile_h, np.nan)
    exit_tl = np.nan
    exit_tr = np.nan
    exit_bl = np.nan
    exit_br = np.nan

    # Top row: drains into the tile above (diagonals may cross laterally).
    fdir_top = frac_bdry.get(('top', iy, ix))
    if fdir_top is not None and iy > 0:
        nb_labels = boundaries.get('bottom', iy - 1, ix)
        for j in range(tile_w):
            d = _dominant_offset_mfd_py(fdir_top[:, j])
            if d[0] == -1:
                dj = j + d[1]
                if 0 <= dj < len(nb_labels):
                    exit_top[j] = nb_labels[dj]
                elif dj < 0 and ix > 0:
                    exit_top[j] = boundaries.get('bottom', iy - 1, ix - 1)[-1]
                elif dj >= len(nb_labels) and ix < n_tile_x - 1:
                    exit_top[j] = boundaries.get('bottom', iy - 1, ix + 1)[0]

    # Bottom row: drains into the tile below.
    fdir_bot = frac_bdry.get(('bottom', iy, ix))
    if fdir_bot is not None and iy < n_tile_y - 1:
        nb_labels = boundaries.get('top', iy + 1, ix)
        for j in range(tile_w):
            d = _dominant_offset_mfd_py(fdir_bot[:, j])
            if d[0] == 1:
                dj = j + d[1]
                if 0 <= dj < len(nb_labels):
                    exit_bottom[j] = nb_labels[dj]
                elif dj < 0 and ix > 0:
                    exit_bottom[j] = boundaries.get('top', iy + 1, ix - 1)[-1]
                elif dj >= len(nb_labels) and ix < n_tile_x - 1:
                    exit_bottom[j] = boundaries.get('top', iy + 1, ix + 1)[0]

    # Left column: drains into the tile on the left (corner diagonals are
    # handled by the corner logic below).
    fdir_left = frac_bdry.get(('left', iy, ix))
    if fdir_left is not None and ix > 0:
        nb_labels = boundaries.get('right', iy, ix - 1)
        for r in range(tile_h):
            d = _dominant_offset_mfd_py(fdir_left[:, r])
            if d[1] == -1:
                dr = r + d[0]
                if 0 <= dr < len(nb_labels):
                    exit_left[r] = nb_labels[dr]

    # Right column: drains into the tile on the right.
    fdir_right = frac_bdry.get(('right', iy, ix))
    if fdir_right is not None and ix < n_tile_x - 1:
        nb_labels = boundaries.get('left', iy, ix + 1)
        for r in range(tile_h):
            d = _dominant_offset_mfd_py(fdir_right[:, r])
            if d[1] == 1:
                dr = r + d[0]
                if 0 <= dr < len(nb_labels):
                    exit_right[r] = nb_labels[dr]

    # Edge-of-grid: flow leaving the whole raster never gets a label.
    if iy == 0 and fdir_top is not None:
        for j in range(tile_w):
            d = _dominant_offset_mfd_py(fdir_top[:, j])
            if d[0] == -1:
                exit_top[j] = np.nan
    if iy == n_tile_y - 1 and fdir_bot is not None:
        for j in range(tile_w):
            d = _dominant_offset_mfd_py(fdir_bot[:, j])
            if d[0] == 1:
                exit_bottom[j] = np.nan
    if ix == 0 and fdir_left is not None:
        for r in range(tile_h):
            d = _dominant_offset_mfd_py(fdir_left[:, r])
            if d[1] == -1:
                exit_left[r] = np.nan
    if ix == n_tile_x - 1 and fdir_right is not None:
        for r in range(tile_h):
            d = _dominant_offset_mfd_py(fdir_right[:, r])
            if d[1] == 1:
                exit_right[r] = np.nan

    # Diagonal corners: a corner cell may step into the diagonal tile.
    if fdir_top is not None:
        d = _dominant_offset_mfd_py(fdir_top[:, 0])
        if d == (-1, -1):
            if iy > 0 and ix > 0:
                exit_tl = boundaries.get('bottom', iy - 1, ix - 1)[-1]
            else:
                exit_tl = np.nan
        d = _dominant_offset_mfd_py(fdir_top[:, -1])
        if d == (-1, 1):
            if iy > 0 and ix < n_tile_x - 1:
                exit_tr = boundaries.get('bottom', iy - 1, ix + 1)[0]
            else:
                exit_tr = np.nan
    if fdir_bot is not None:
        d = _dominant_offset_mfd_py(fdir_bot[:, 0])
        if d == (1, -1):
            if iy < n_tile_y - 1 and ix > 0:
                exit_bl = boundaries.get('top', iy + 1, ix - 1)[-1]
            else:
                exit_bl = np.nan
        d = _dominant_offset_mfd_py(fdir_bot[:, -1])
        if d == (1, 1):
            if iy < n_tile_y - 1 and ix < n_tile_x - 1:
                exit_br = boundaries.get('top', iy + 1, ix + 1)[0]
            else:
                exit_br = np.nan

    return (exit_top, exit_bottom, exit_left, exit_right,
            exit_tl, exit_tr, exit_bl, exit_br)


def _process_tile_mfd(iy, ix, fractions_da, pour_points_da,
                      boundaries, frac_bdry,
                      chunks_y, chunks_x, n_tile_y, n_tile_x):
    """Run one seeded trace over an MFD tile; return True on boundary change."""
    y_start = sum(chunks_y[:iy])
    y_end = y_start + chunks_y[iy]
    x_start = sum(chunks_x[:ix])
    x_end = x_start + chunks_x[ix]

    chunk = np.asarray(
        fractions_da[:, y_start:y_end, x_start:x_end].compute(),
        dtype=np.float64)
    pp_chunk = np.asarray(
        pour_points_da.blocks[iy, ix].compute(), dtype=np.float64)
    _, h, w = chunk.shape

    exits = _compute_exit_labels_mfd(
        iy, ix, boundaries, frac_bdry,
        chunks_y, chunks_x, n_tile_y, n_tile_x)

    result = _watershed_mfd_tile_kernel(chunk, h, w, pp_chunk, *exits)

    new_top = result[0, :].copy()
    new_bottom = result[-1, :].copy()
    new_left = result[:, 0].copy()
    new_right = result[:, -1].copy()

    # Detect convergence: any non-(NaN vs NaN) difference on a boundary.
    changed = False
    for side, new in (('top', new_top), ('bottom', new_bottom),
                      ('left', new_left), ('right', new_right)):
        old = boundaries.get(side, iy, ix).copy()
        with np.errstate(invalid='ignore'):
            mask = ~(np.isnan(old) & np.isnan(new))
            if mask.any():
                diff = old[mask] != new[mask]
                if np.any(diff):
                    changed = True
                    break

    boundaries.set('top', iy, ix, new_top)
    boundaries.set('bottom', iy, ix, new_bottom)
    boundaries.set('left', iy, ix, new_left)
    boundaries.set('right', iy, ix, new_right)

    return changed
def _watershed_mfd_dask(fractions_da, pour_points_da, chunks_y, chunks_x):
    """Dask backend: iterate tile sweeps until boundary labels converge,
    then assemble the final labeled raster."""
    n_tile_y = len(chunks_y)
    n_tile_x = len(chunks_x)

    frac_bdry = _preprocess_mfd_tiles(fractions_da, chunks_y, chunks_x)
    boundaries = BoundaryStore(chunks_y, chunks_x, fill_value=np.nan)

    # Labels propagate at least one tile per sweep; forward+backward
    # sweeps per iteration make this bound generous.
    max_iterations = max(n_tile_y, n_tile_x) * 2 + 10

    for _iteration in range(max_iterations):
        any_changed = False

        for iy in range(n_tile_y):
            for ix in range(n_tile_x):
                c = _process_tile_mfd(
                    iy, ix, fractions_da, pour_points_da,
                    boundaries, frac_bdry,
                    chunks_y, chunks_x, n_tile_y, n_tile_x)
                if c:
                    any_changed = True

        for iy in reversed(range(n_tile_y)):
            for ix in reversed(range(n_tile_x)):
                c = _process_tile_mfd(
                    iy, ix, fractions_da, pour_points_da,
                    boundaries, frac_bdry,
                    chunks_y, chunks_x, n_tile_y, n_tile_x)
                if c:
                    any_changed = True

        if not any_changed:
            break

    boundaries = boundaries.snapshot()

    # Assemble the final result tile-by-tile from the converged labels.
    rows = []
    for iy in range(n_tile_y):
        row = []
        for ix in range(n_tile_x):
            y_start = sum(chunks_y[:iy])
            y_end = y_start + chunks_y[iy]
            x_start = sum(chunks_x[:ix])
            x_end = x_start + chunks_x[ix]

            chunk = np.asarray(
                fractions_da[:, y_start:y_end, x_start:x_end].compute(),
                dtype=np.float64)
            pp_chunk = np.asarray(
                pour_points_da.blocks[iy, ix].compute(), dtype=np.float64)
            _, h, w = chunk.shape

            exits = _compute_exit_labels_mfd(
                iy, ix, boundaries, frac_bdry,
                chunks_y, chunks_x, n_tile_y, n_tile_x)

            tile = _watershed_mfd_tile_kernel(chunk, h, w, pp_chunk, *exits)
            row.append(da.from_array(tile, chunks=tile.shape))
        rows.append(row)

    return da.block(rows)


# =====================================================================
# Dask+CuPy backend
# =====================================================================

def _watershed_mfd_dask_cupy(fractions_da, pour_points_da,
                             chunks_y, chunks_x):
    """Dask+CuPy backend: pull blocks to host, run the dask path,
    push the result blocks back to the device."""
    import cupy as cp
    fr_np = fractions_da.map_blocks(
        lambda b: b.get(), dtype=fractions_da.dtype,
        meta=np.array((), dtype=fractions_da.dtype),
    )
    pp_np = pour_points_da.map_blocks(
        lambda b: b.get(), dtype=pour_points_da.dtype,
        meta=np.array((), dtype=pour_points_da.dtype),
    )
    result = _watershed_mfd_dask(fr_np, pp_np, chunks_y, chunks_x)
    return result.map_blocks(
        cp.asarray, dtype=result.dtype,
        meta=cp.array((), dtype=result.dtype),
    )


# =====================================================================
# Public API
# =====================================================================

@supports_dataset
def watershed_mfd(flow_dir_mfd: xr.DataArray,
                  pour_points: xr.DataArray,
                  name: str = 'watershed_mfd') -> xr.DataArray:
    """Label each cell with the pour point it drains to (MFD).

    Parameters
    ----------
    flow_dir_mfd : xarray.DataArray or xr.Dataset
        3D MFD flow direction array of shape (8, H, W).
    pour_points : xarray.DataArray
        2D raster where non-NaN cells are pour points.  Must have shape
        (H, W) matching the spatial dims of ``flow_dir_mfd``.
    name : str, default='watershed_mfd'
        Name of output DataArray.

    Returns
    -------
    xarray.DataArray or xr.Dataset
        2D float64 array where each cell = label of its pour point.
        NaN for nodata or unreachable cells.

    Raises
    ------
    ValueError
        If ``flow_dir_mfd`` is not (8, H, W) or ``pour_points`` has a
        mismatching shape.
    TypeError
        If the underlying array type is unsupported.
    """
    _validate_raster(flow_dir_mfd, func_name='watershed_mfd',
                     name='flow_dir_mfd', ndim=3)

    data = flow_dir_mfd.data
    pp_data = pour_points.data

    if data.ndim != 3 or data.shape[0] != 8:
        raise ValueError(
            f"flow_dir_mfd must have shape (8, H, W), got {data.shape}")

    _, H, W = data.shape

    # Fail early with a clear message instead of a backend-specific error.
    if pp_data.shape != (H, W):
        raise ValueError(
            f"pour_points shape {pp_data.shape} does not match "
            f"flow_dir_mfd spatial shape {(H, W)}")

    if isinstance(data, np.ndarray):
        fr = data.astype(np.float64)
        pp = np.asarray(pp_data, dtype=np.float64)
        # Vectorized seeding (replaces a per-cell Python loop):
        # nodata (fractions[0] NaN) -> state 0, pour point -> state 3
        # with its label, everything else -> state 1 (unresolved).
        nodata = np.isnan(fr[0])
        seeded = ~nodata & ~np.isnan(pp)
        labels = np.where(seeded, pp, np.nan)
        state = np.zeros((H, W), dtype=np.int8)
        state[seeded] = 3
        state[~nodata & ~seeded] = 1
        out = _watershed_mfd_cpu(fr, labels, state, H, W)

    elif has_cuda_and_cupy() and is_cupy_array(data):
        out = _watershed_mfd_cupy(data, pp_data)

    elif has_cuda_and_cupy() and is_dask_cupy(flow_dir_mfd):
        chunks_y = data.chunks[1]
        chunks_x = data.chunks[2]
        out = _watershed_mfd_dask_cupy(data, pp_data, chunks_y, chunks_x)

    elif da is not None and isinstance(data, da.Array):
        chunks_y = data.chunks[1]
        chunks_x = data.chunks[2]
        out = _watershed_mfd_dask(data, pp_data, chunks_y, chunks_x)

    else:
        raise TypeError(f"Unsupported array type: {type(data)}")

    # Output is 2D: keep only the spatial dims/coords of the input.
    spatial_dims = flow_dir_mfd.dims[1:]
    coords = {}
    for d in spatial_dims:
        if d in flow_dir_mfd.coords:
            coords[d] = flow_dir_mfd.coords[d]

    return xr.DataArray(out,
                        name=name,
                        coords=coords,
                        dims=spatial_dims,
                        attrs=flow_dir_mfd.attrs)