Comparing-Nodes/inDegrees.py at main · AlexChanson/Comparing-Nodes · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd


def in_degree_by_relationship_type(driver, label: str) -> pd.DataFrame:
    """
    Compute, for every node with the given label, the in-degree grouped by relationship type.

    Two read queries are run in a single session: one listing the ids of all
    nodes carrying the label, one counting incoming relationships per
    (node, relationship type). The counts are pivoted so each relationship
    type becomes its own column, then left-joined onto the full node list.

    Parameters
    ----------
    driver : neo4j.Driver
        An existing Neo4j driver (e.g., from neo4j import GraphDatabase; GraphDatabase.driver(...)).
        Only ``driver.session()`` (used as a context manager),
        ``session.run(query)`` and ``result.data()`` are called on it.
    label : str
        Node label to target (e.g., "City"). The label is interpolated into the
        query text as a backtick-quoted identifier; any backtick inside the
        label is doubled so it cannot terminate the quoting.

    Returns
    -------
    pd.DataFrame
        - first column: ``nodeId`` (the value of Cypher ``id(n)``), dtype ``Int64``
        - subsequent columns: one per relationship type, in alphabetical order,
          holding integer counts.
        If no node has the label, the frame is empty with only ``nodeId``.
        If the nodes have no incoming relationships at all, the frame contains
        only the ``nodeId`` column (there are no relationship types to name).

    Notes
    -----
    - Only incoming relationships are counted: ()-[r]->(n:Label).
    - Nodes with no incoming relationships are included with zeros across all relationship columns.
    - Relationship types become column names.

    """
    # The label is embedded in the query text rather than sent as a parameter,
    # so it must be quoted. Wrapping in backticks alone is not enough: a label
    # containing a backtick would close the identifier early. Doubling each
    # embedded backtick is the escape form for backtick-quoted identifiers.
    label_bt = "`" + str(label).replace("`", "``") + "`"

    q_nodes = f"""
    MATCH (n:{label_bt})
    RETURN id(n) AS nodeId
    """

    q_counts = f"""
    MATCH ()-[r]->(n:{label_bt})
    RETURN id(n) AS nodeId, type(r) AS relType, count(r) AS cnt
    """

    with driver.session() as session:
        # All target node ids
        nodes_df = pd.DataFrame(session.run(q_nodes).data())
        if nodes_df.empty:
            # No nodes of that label: skip the second query and return an
            # empty frame that still exposes the nodeId column.
            return pd.DataFrame(columns=["nodeId"]).astype({"nodeId": "Int64"})

        # In-degree counts grouped by relationship type. Explicit columns keep
        # the frame well-formed even when the query returns no rows.
        count_rows = session.run(q_counts).data()
        counts_df = pd.DataFrame(count_rows, columns=["nodeId", "relType", "cnt"])

    if counts_df.empty:
        # No incoming relationships at all: there are no relationship-type
        # columns to add, so only nodeId is returned.
        return nodes_df.astype({"nodeId": "Int64"})

    # Pivot so each relationship type is its own column
    pivot = counts_df.pivot_table(
        index="nodeId", columns="relType", values="cnt", aggfunc="sum", fill_value=0
    ).reset_index()

    # Ensure *all* nodes are present (including those without incoming edges);
    # the left join leaves NaN for them, which becomes a zero count.
    df = nodes_df.merge(pivot, on="nodeId", how="left").fillna(0)

    # Relationship-type columns, alphabetical. The NaN fill above turned them
    # into floats, so cast back to integers in one pass together with nodeId.
    rel_cols = sorted(c for c in df.columns if c != "nodeId")
    dtypes = {col: int for col in rel_cols}
    dtypes["nodeId"] = "Int64"
    return df[["nodeId", *rel_cols]].astype(dtypes)