# Advanced Exercise

### 1. **Generate Primary Stock using Market Cap**

**df\_primary\_stock — Primary Security Selection per Entity**

This DataFrame identifies the primary traded security for each entity on each date, based on the largest market capitalization among its share classes.

```python

import pandas as pd
import gc
from finter.data import ContentFactory, IdTable


def compute_primary_stock(data_start=20100101, data_end=20250901):
    """
    Identify the primary traded security per entity (gvkey) per day.

    For each (date, gvkey) pair, the share class with the largest market
    capitalization among the entity's ordinary shares (tpci '0') and ADRs
    (tpci 'F') is selected as the primary security.

    Parameters
    ----------
    data_start, data_end : int
        Inclusive period bounds in YYYYMMDD form, forwarded to ContentFactory.

    Returns
    -------
    pandas.DataFrame
        One row per (date, gvkey) with columns including 'index' (the date),
        'gvkeyiid', 'gvkey', 'mktcap', 'mktcap_rank' and 'tic'.

    Notes
    -----
    - Market caps are forward-filled with ffill(limit_area='inside', limit=3),
      i.e. only interior gaps of up to 3 rows are bridged.
    - ascending=True is applied explicitly in sort_values() for readability.
    """

    # get_df options: use the raw (non-quantit) universe and keep NaNs as-is.
    quantit_universe = False
    fill_nan = False

    # ---------------------------------------------------
    # 1️ Load universe metadata and define period
    # ---------------------------------------------------
    df_stock = IdTable("spglobal-usa").get_stock()
    # gvkeyiid uniquely identifies a share class (entity id + issue id);
    # assumes both columns are strings so '+' concatenates — TODO confirm.
    df_stock["gvkeyiid"] = df_stock["gvkey"] + df_stock["iid"]

    cf = ContentFactory("us_stock", start=data_start, end=data_end)

    df_mktcap_pv = cf.get_df("mkt_cap",
                             quantit_universe=quantit_universe,
                             fill_nan=fill_nan)
    # Drop securities with no data at all, then bridge short interior gaps.
    df_mktcap_pv = df_mktcap_pv.dropna(axis=1, how="all")
    df_mktcap_pv = df_mktcap_pv.ffill(limit_area="inside", limit=3)

    # ---------------------------------------------------
    # 2️ Reshape into long format: one row per (date, gvkeyiid)
    # ---------------------------------------------------
    df_mktcap = (
        df_mktcap_pv.reset_index()
                    .melt(id_vars="index",
                          var_name="gvkeyiid",
                          value_name="mktcap")
    )
    # First 6 characters of gvkeyiid are the entity id (gvkey).
    df_mktcap["gvkey"] = df_mktcap["gvkeyiid"].str[:6]
    df_mktcap = df_mktcap.dropna(subset=["mktcap"])

    # ---------------------------------------------------
    # 3️ Rank securities within each entity by size
    #    (rank 1 = largest market cap on that date)
    # ---------------------------------------------------
    df_mktcap["mktcap_rank"] = (
        df_mktcap.groupby(["index", "gvkey"])["mktcap"]
                 .rank(ascending=False)
    )
    del df_mktcap_pv  # free the wide frame; only the long format is needed now

    # ---------------------------------------------------
    # 4️ Select ordinary shares & ADR only (tpci '0' = ordinary, 'F' = ADR)
    # ---------------------------------------------------
    ordinary_adr_shares = set(df_stock.query("tpci in ['0', 'F']")["gvkeyiid"].unique())
    df_mktcap_ordinary = df_mktcap[df_mktcap["gvkeyiid"].isin(ordinary_adr_shares)]

    # ---------------------------------------------------
    # 5️ Select top-ranked (primary) share per entity per day
    # ---------------------------------------------------
    df_primary_shares = (
        df_mktcap_ordinary
        .sort_values(by=["index", "gvkey", "mktcap_rank"], ascending=True)  # ✅ ascending added
        .drop_duplicates(subset=["index", "gvkey"], keep="first")
        .reset_index(drop=True)
    )

    # ---------------------------------------------------
    # 6️ Add ticker (tic) information
    # ---------------------------------------------------
    df_primary_shares = df_primary_shares.merge(
        df_stock[["gvkeyiid", "tic"]],
        on="gvkeyiid",
        how="left"
    )

    # ---------------------------------------------------
    # 7️ Sanity check — after drop_duplicates each (date, gvkey) should
    #    appear exactly once, so the happy path should always print.
    # ---------------------------------------------------
    num_shares_entity = df_primary_shares.groupby(["index", "gvkey"])["gvkeyiid"].count().max()
    if num_shares_entity > 1:
        print(f"Total number of shares for an entity exceeds 1!! ({num_shares_entity})")
    else:
        print("No worries :) Each entity has exactly one primary share per day.")

    # NOTE(review): assert is stripped under `python -O`; fine in a notebook,
    # but raise explicitly if this ever moves into production code.
    assert all(~df_primary_shares[["index", "gvkey"]].duplicated()), \
           "Duplicated gvkey detected!"

    del df_mktcap, df_mktcap_ordinary
    gc.collect()

    print(f"Primary shares computed: {df_primary_shares['index'].nunique()} dates, "
          f"{df_primary_shares['gvkey'].nunique()} unique entities.")
    return df_primary_shares



# Build the primary-stock table for the default 2010-01-01..2025-09-01 window.
primary_stock = compute_primary_stock()
    
```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-eba784a28c95b4a92064ce08c5c35553671c9242%2Fimage%20(11).png?alt=media" alt="" width="563"><figcaption></figcaption></figure>

***

### 2. Linking Fundamental PIT Data to Tickers

> In this example, we use the function created in Exercise 1.

```python
# =====================================================
# 0) Import Packages
# =====================================================
import pandas as pd
import numpy as np
import gc
from finter.data import ContentFactory, FinHelper



# =====================================================
# 1) Universe Metadata & Period Definition
# =====================================================
data_start = 20240101
data_end   = 20251104


# =====================================================
# 2) Compute Primary Stock using the helper function
#    (defined in Exercise 1; must be in scope when running this cell)
# =====================================================
df_primary_stock = compute_primary_stock(data_start=data_start, data_end=data_end)


# =====================================================
# 3) Load Quarterly Financial Statement Data
#    astype int -> str normalizes 'fiscal' to a plain YYYYQQ string
# =====================================================
cf = ContentFactory(universe_name="us_stock", start=data_start, end=data_end)

rev    = cf.get_df("pit-revtq",  mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int}).astype({"fiscal": str})
niq    = cf.get_df("pit-niq",    mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int}).astype({"fiscal": str})
oiadpq = cf.get_df("pit-oiadpq", mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int}).astype({"fiscal": str})

# Align the three items on a common (id, fiscal, pit) index before concat.
u_rev, u_niq, u_oiadpq = FinHelper.unify_idx(rev, niq, oiadpq)

df_financials = pd.concat([
    u_rev.rename(columns={"value": "Revenue"}),
    u_niq.rename(columns={"value": "NetIncome"}),
    u_oiadpq.rename(columns={"value": "OperatingIncome"})
], axis=1).reset_index()


# =====================================================
# 4) For each PIT date, keep only the latest fiscal period
#    (sort by fiscal ascending, then keep='last' per (id, pit))
# =====================================================
df_financials_latest = (
    df_financials
    .sort_values(by=["id", "pit", "fiscal"], ascending=True)
    .drop_duplicates(subset=["id", "pit"], keep="last")
)

df_financials_snapshot = df_financials_latest.copy()
df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"])
df_financials_snapshot = df_financials_snapshot.rename(columns={"id": "gvkey"})


# =====================================================
# 5) Map Financials → Primary Security (by PIT date)
# =====================================================
df_primary = df_primary_stock[["index", "gvkey", "gvkeyiid", "tic"]].copy()
df_primary = df_primary.rename(columns={"index": "date"})

# --- Essential Data Cleaning Steps ---
# NOTE(review): 'pit' was already converted to datetime above; this second
# pass only strips any timezone so both merge keys are tz-naive.
df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"]).dt.tz_localize(None)
df_primary["date"] = pd.to_datetime(df_primary["date"]).dt.tz_localize(None)

df_financials_snapshot["gvkey"] = df_financials_snapshot["gvkey"].astype(str)
df_primary["gvkey"] = df_primary["gvkey"].astype(str)

df_financials_snapshot = df_financials_snapshot.dropna(subset=["gvkey", "pit"])
df_primary = df_primary.dropna(subset=["gvkey", "date"])

# Sort both sides (merge_asof requires sorted inputs)
df_primary = df_primary.sort_values(["date", "gvkey"]).reset_index(drop=True)
df_financials_snapshot = df_financials_snapshot.sort_values(["pit", "gvkey"]).reset_index(drop=True)

# Merge PIT-based financial data with the nearest ticker info
df_financials_with_tic = pd.merge_asof(
    left=df_financials_snapshot,     # Base: financial snapshot (pit)
    right=df_primary,                # Lookup: daily primary data (date, tic)
    by="gvkey",
    left_on="pit",
    right_on="date",
    direction="backward",            # Match to the closest earlier date
    tolerance=pd.Timedelta("90D")    # Allow 90-day gap (quarterly reporting lag)
)

# Keep relevant columns only
df_financials_with_tic = df_financials_with_tic[[
    "gvkey", "fiscal", "pit", "Revenue", "NetIncome", "OperatingIncome", "tic"
]]

print(df_financials_with_tic.head())
```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-53fb1cf27b28d39fd3966ef46c3dbfa263d199bf%2Fimage%20(12).png?alt=media" alt="" width="563"><figcaption></figcaption></figure>

***

### 3. Visualizing Profitability Margins

Net profit margins, operating margins, and EV/EBITDA over time:

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from finter.data import ContentFactory, FinHelper, IdTable

# =====================================================
# 0) Universe & Period
# =====================================================
data_start = 20000101
data_end   = 20251231
cf = ContentFactory(universe_name='us_stock', start=data_start, end=data_end)

# =====================================================
# 1) Load Raw Financial Statement Data (Income Statement)
#    unpivot format: (id, fiscal, pit, value); 'fiscal' is normalized to a
#    YYYYQQ string via astype int -> str
# =====================================================
rev    = cf.get_df('pit-revtq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
niq    = cf.get_df('pit-niq',    mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
oiadpq = cf.get_df('pit-oiadpq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cogsq  = cf.get_df('pit-cogsq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 2) EBITDA and PPE
# =====================================================
oibdpq = cf.get_df('pit-oibdpq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
ppegtq = cf.get_df('pit-ppegtq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 3) Balance Sheet and Cash Flow Items
# =====================================================
ltq  = cf.get_df('pit-ltq',   mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
ceqq = cf.get_df('pit-ceqq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
atq  = cf.get_df('pit-atq',   mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
actq = cf.get_df('pit-actq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
lctq = cf.get_df('pit-lctq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cheq = cf.get_df('pit-cheq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cfoq = cf.get_df('pit-cfoq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 4) Apply unify_idx and Merge into a Single DataFrame
#    (align all items on a common (id, fiscal, pit) index, then go wide)
# =====================================================
u_rev, u_niq, u_oiadpq, u_cogsq, u_oibdpq, u_ppegtq, u_ltq, u_ceqq, u_atq, u_actq, u_lctq, u_cheq, u_cfoq = FinHelper.unify_idx(
    rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq
)

df_financials = pd.concat(
    [
        u_rev.rename(columns={'value': 'Revenue'}),
        u_niq.rename(columns={'value': 'NetIncome'}),
        u_oiadpq.rename(columns={'value': 'OperatingIncome'}),
        u_cogsq.rename(columns={'value': 'COGS'}),
        u_oibdpq.rename(columns={'value': 'EBITDA'}),
        u_ppegtq.rename(columns={'value': 'PPE'}),
        u_ltq.rename(columns={'value': 'TotalLiabilities'}),
        u_ceqq.rename(columns={'value': 'Equity'}),
        u_atq.rename(columns={'value': 'TotalAssets'}),
        u_actq.rename(columns={'value': 'CurrentAssets'}),
        u_lctq.rename(columns={'value': 'CurrentLiabilities'}),
        u_cheq.rename(columns={'value': 'Cash'}),
        u_cfoq.rename(columns={'value': 'OperatingCashFlow'})
    ],
    axis=1
).reset_index()

# =====================================================
# 5) For each PIT date, keep only the latest fiscal period
#    (sort by fiscal ascending, then keep='last' per (id, pit))
# =====================================================
df_financials_latest = (
    df_financials
    .sort_values(by=['id', 'pit', 'fiscal'], ascending=True)
    .drop_duplicates(subset=['id', 'pit'], keep='last')
)
df_financials_snapshot = df_financials_latest.copy()

# =====================================================
# 6) Company Mapping (Ticker via Primary Stock function)
#    → Use market-cap–based primary stock (Exercise 1 function)
# =====================================================
df_primary_stock = compute_primary_stock(data_start=data_start, data_end=data_end)

# PIT(date) 기준으로 최근 일자 primary ticker 매칭
df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"])
df_financials_snapshot = df_financials_snapshot.rename(columns={"id": "gvkey"})

df_primary = df_primary_stock[["index", "gvkey", "gvkeyiid", "tic"]].copy()
df_primary = df_primary.rename(columns={"index": "date"})
df_primary["date"] = pd.to_datetime(df_primary["date"])

# merge_asof 준비: 정렬 & 타입 통일
df_financials_snapshot["gvkey"] = df_financials_snapshot["gvkey"].astype(str)
df_primary["gvkey"]             = df_primary["gvkey"].astype(str)

df_financials_snapshot = df_financials_snapshot.dropna(subset=["gvkey", "pit"])
df_primary = df_primary.dropna(subset=["gvkey", "date"])

df_financials_snapshot = df_financials_snapshot.sort_values(["pit", "gvkey"]).reset_index(drop=True)
df_primary = df_primary.sort_values(["date", "gvkey"]).reset_index(drop=True)

df_financials_snapshot = pd.merge_asof(
    left=df_financials_snapshot,
    right=df_primary,
    by="gvkey",
    left_on="pit",
    right_on="date",
    direction="backward",
    tolerance=pd.Timedelta("90D")  
)


df_financials_snapshot = df_financials_snapshot.drop(columns=["date"], errors="ignore")

# =====================================================
# 7) Profitability Ratios
# =====================================================
df_financials_snapshot["OperatingMargin"] = (
    df_financials_snapshot["OperatingIncome"] / df_financials_snapshot["Revenue"]
)
df_financials_snapshot["NetMargin"] = (
    df_financials_snapshot["NetIncome"] / df_financials_snapshot["Revenue"]
)

# =====================================================
# 8) EV/EBITDA Multiple (Entity MktCap)
# =====================================================
# NOTE(review): unlike Exercise 1 this call uses get_df defaults
# (quantit_universe / fill_nan unspecified) — confirm that is intended.
df_mktcap_pv = cf.get_df("mkt_cap")
df_mktcap_pv = df_mktcap_pv.dropna(axis=1, how="all").ffill(limit_area='inside', limit=3)

# Wide → long: one row per (date, gvkeyiid)
df_mktcap = (
    df_mktcap_pv.reset_index()
                 .melt(id_vars="index", var_name="gvkeyiid", value_name="mktcap")
)
df_mktcap["gvkey"] = df_mktcap["gvkeyiid"].str[:6]

# Entity-level market cap = sum over all share classes of the entity
df_entity_mktcap = (
    df_mktcap.groupby(["index", "gvkey"], as_index=False)["mktcap"]
             .sum()
             .rename(columns={"index": "pit", "mktcap": "entity_mktcap"})
)
df_entity_mktcap["pit"] = pd.to_datetime(df_entity_mktcap["pit"])


# Attach the most recent entity market cap on or before each PIT date
df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"])
df_financials_snapshot = pd.merge_asof(
    df_financials_snapshot.sort_values("pit"),
    df_entity_mktcap.sort_values("pit"),
    by="gvkey",
    on="pit",
    direction="backward"
)
df_financials_snapshot = df_financials_snapshot.drop(columns=["index"], errors="ignore")

# Scale market cap to millions; assumes financial statement items are
# reported in millions of USD — TODO confirm against the data docs.
df_financials_snapshot["entity_mktcap_million"] = df_financials_snapshot["entity_mktcap"] / 1_000_000

# EV = Market Cap (in million USD) + Total Liabilities - Cash
df_financials_snapshot["EnterpriseValue"] = (
    df_financials_snapshot["entity_mktcap_million"]
    + df_financials_snapshot["TotalLiabilities"]
    - df_financials_snapshot["Cash"]
)

# EV/EBITDA only where EBITDA is positive; otherwise NaN
df_financials_snapshot["EV_EBITDA"] = np.where(
    df_financials_snapshot["EBITDA"] > 0,
    df_financials_snapshot["EnterpriseValue"] / df_financials_snapshot["EBITDA"],
    np.nan
)

# =====================================================
# 9) Data Cleaning and Sanity Checks
# =====================================================
# Invalidate EV/EBITDA when enterprise value is non-positive, and clip
# implausible margins (|margin| > 1000%) to NaN before plotting.
df_financials_snapshot.loc[df_financials_snapshot["EnterpriseValue"] <= 0, "EV_EBITDA"] = np.nan
df_financials_snapshot.loc[df_financials_snapshot["OperatingMargin"].abs() > 10, "OperatingMargin"] = np.nan
df_financials_snapshot.loc[df_financials_snapshot["NetMargin"].abs() > 10, "NetMargin"] = np.nan

df_financials_snapshot["OperatingMargin_pct"] = df_financials_snapshot["OperatingMargin"] * 100
df_financials_snapshot["NetMargin_pct"]       = df_financials_snapshot["NetMargin"] * 100


# 'fiscal' is a string column (cast via .astype({'fiscal': str}) at load
# time), so compare against a string: a str >= int comparison raises
# TypeError in pandas. Equal-length YYYYQQ strings sort lexicographically
# in chronological order, so this matches the intended numeric cutoff.
df_financials_snapshot = df_financials_snapshot[df_financials_snapshot['fiscal'] >= '200001']

# =====================================================
# 10) Preview
# =====================================================
print(
    df_financials_snapshot[
        ["tic", "fiscal", "OperatingMargin_pct", "NetMargin_pct", "EV_EBITDA"]
    ].head(10)
)

# =====================================================
# 11) Operating Margin Distribution by Fiscal Period
# =====================================================
# Observation count per fiscal quarter (used below to drop thin quarters)
df_counts = (
    df_financials_snapshot
    .groupby(['fiscal'])['OperatingMargin_pct']
    .count()
    .reset_index(name='count')
)

# Percentile summary of the operating-margin cross-section per quarter
df_op_margin = (
    df_financials_snapshot
    .groupby(['fiscal'])['OperatingMargin_pct']
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
)

# Keep only quarters with at least 500 reporting companies
valid_idx = df_counts.query("count >= 500")['fiscal']
df_op_margin_filtered = df_op_margin.loc[df_op_margin.index.isin(valid_idx)].copy()

def fiscal_to_quarter(f):
    """Convert a YYYYQQ fiscal code (str or int, e.g. 202003) to 'YYYY-Qn'."""
    code = str(f)
    return f"{code[:4]}-Q{int(code[-2:])}"

# Human-readable x-axis labels, e.g. '2020-Q1'
df_op_margin_filtered['fiscal_label'] = df_op_margin_filtered.index.map(fiscal_to_quarter)

# Plot the cross-sectional percentile bands of operating margin per quarter
plt.figure(figsize=(14,6))
plt.plot(df_op_margin_filtered['fiscal_label'], df_op_margin_filtered['5%'],  label="5th Percentile")
plt.plot(df_op_margin_filtered['fiscal_label'], df_op_margin_filtered['25%'], label="25th Percentile")
plt.plot(df_op_margin_filtered['fiscal_label'], df_op_margin_filtered['50%'], label="Median (50%)", linewidth=2.5)
plt.plot(df_op_margin_filtered['fiscal_label'], df_op_margin_filtered['75%'], label="75th Percentile")
plt.plot(df_op_margin_filtered['fiscal_label'], df_op_margin_filtered['95%'], label="95th Percentile")

plt.title("U.S. Companies — Operating Margin Distribution (%)", fontsize=14)
plt.ylabel("Operating Margin (%)", fontsize=12)
plt.xlabel("Fiscal Quarter", fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()

# =====================================================
# 12) Net Margin Distribution by Fiscal Period
#     (same recipe as section 11, applied to NetMargin_pct)
# =====================================================
df_counts = (
    df_financials_snapshot
    .groupby(['fiscal'])['NetMargin_pct']
    .count()
    .reset_index(name='count')
)

df_net_margin = (
    df_financials_snapshot
    .groupby(['fiscal'])['NetMargin_pct']
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
)

# Keep only quarters with at least 500 reporting companies
valid_idx = df_counts.query("count >= 500")['fiscal']
df_net_margin_filtered = df_net_margin.loc[df_net_margin.index.isin(valid_idx)].copy()
df_net_margin_filtered['fiscal_label'] = df_net_margin_filtered.index.map(fiscal_to_quarter)

plt.figure(figsize=(14,6))
plt.plot(df_net_margin_filtered['fiscal_label'], df_net_margin_filtered['5%'],  label="5th Percentile")
plt.plot(df_net_margin_filtered['fiscal_label'], df_net_margin_filtered['25%'], label="25th Percentile")
plt.plot(df_net_margin_filtered['fiscal_label'], df_net_margin_filtered['50%'], label="Median (50%)", linewidth=2.5)
plt.plot(df_net_margin_filtered['fiscal_label'], df_net_margin_filtered['75%'], label="75th Percentile")
plt.plot(df_net_margin_filtered['fiscal_label'], df_net_margin_filtered['95%'], label="95th Percentile")

plt.title("U.S. Companies — Net Margin Distribution (%)", fontsize=14)
plt.ylabel("Net Margin (%)", fontsize=12)
plt.xlabel("Fiscal Quarter", fontsize=12)
plt.xticks(rotation=45)
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()
```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-2066dacc46fecb9b92023d6a6648f13e5c0ef601%2Fimage%20(15).png?alt=media" alt=""><figcaption></figcaption></figure>

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-d923ec2d8e991dabe1f32eb281b4018f916f1e3b%2Fimage%20(16).png?alt=media" alt=""><figcaption></figcaption></figure>

***

### 4. ROE DuPont analysis using S\&P 500 data

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from finter.data import ContentFactory, FinHelper, IdTable

# =====================================================
# 0) Universe & Period
# =====================================================
data_start = 20200101
data_end   = 20251026
cf = ContentFactory(universe_name='us_stock', start=data_start, end=data_end)

# =====================================================
# 1) Load Raw Financial Statement Data (Income Statement)
#    (Load all financial items in unpivot format: id, fiscal, pit, value;
#    'fiscal' is normalized to a YYYYQQ string via astype int -> str)
# =====================================================
rev    = cf.get_df('pit-revtq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
niq    = cf.get_df('pit-niq',    mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
oiadpq = cf.get_df('pit-oiadpq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cogsq  = cf.get_df('pit-cogsq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 2) EBITDA and PPE
# =====================================================
oibdpq = cf.get_df('pit-oibdpq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
ppegtq = cf.get_df('pit-ppegtq', mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 3) Balance Sheet and Cash Flow Items
# =====================================================
ltq  = cf.get_df('pit-ltq',   mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
ceqq = cf.get_df('pit-ceqq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
atq  = cf.get_df('pit-atq',   mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
actq = cf.get_df('pit-actq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
lctq = cf.get_df('pit-lctq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cheq = cf.get_df('pit-cheq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})
cfoq = cf.get_df('pit-cfoq',  mode='unpivot').dropna(how='all', axis=1).astype({'fiscal': int}).astype({'fiscal': str})

# =====================================================
# 4) Keep the Earliest PIT per Fiscal Period
"""
In this step, we handle cases where the same company may release 
multiple financial reports for the same fiscal period. 
To ensure consistency, we keep only the latest confirmed filing for each 
company and fiscal year before visualization.
"""
# =====================================================
def keep_earliest_per_fiscal(df):
    """
    Keep only the earliest PIT filing for each (id, fiscal) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Unpivot-format financials with at least 'id', 'fiscal' and 'pit'
        columns.

    Returns
    -------
    pandas.DataFrame
        One intact row per (id, fiscal): the row with the smallest 'pit'.

    Notes
    -----
    drop_duplicates(keep='first') is used instead of groupby(...).first()
    because GroupBy.first() returns the first *non-null* value per column,
    which can silently stitch together values from different filings when
    the earliest row contains NaNs. drop_duplicates preserves each row
    as-is.
    """
    return (
        df.sort_values(['id', 'fiscal', 'pit'])
          .drop_duplicates(subset=['id', 'fiscal'], keep='first')
          .reset_index(drop=True)
    )

# Apply the earliest-PIT filter to every financial item, then unpack back
# into the original variable names.
datasets = [rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq]
datasets = [keep_earliest_per_fiscal(df) for df in datasets]
rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq = datasets

# =====================================================
# 5) Apply unify_idx and Merge into a Single DataFrame
#    (Unify id, fiscal, pit indices and reshape to wide format)
# =====================================================
u_rev, u_niq, u_oiadpq, u_cogsq, u_oibdpq, u_ppegtq, u_ltq, u_ceqq, u_atq, u_actq, u_lctq, u_cheq, u_cfoq = FinHelper.unify_idx(
    rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq
)

df_financials = pd.concat(
    [
        u_rev.rename(columns={'value': 'Revenue'}),
        u_niq.rename(columns={'value': 'NetIncome'}),
        u_oiadpq.rename(columns={'value': 'OperatingIncome'}),
        u_cogsq.rename(columns={'value': 'COGS'}),
        u_oibdpq.rename(columns={'value': 'EBITDA'}),
        u_ppegtq.rename(columns={'value': 'PPE'}),
        u_ltq.rename(columns={'value': 'TotalLiabilities'}),
        u_ceqq.rename(columns={'value': 'Equity'}),
        u_atq.rename(columns={'value': 'TotalAssets'}),
        u_actq.rename(columns={'value': 'CurrentAssets'}),
        u_lctq.rename(columns={'value': 'CurrentLiabilities'}),
        u_cheq.rename(columns={'value': 'Cash'}),
        u_cfoq.rename(columns={'value': 'OperatingCashFlow'})
    ],
    axis=1
).reset_index()  # bring id, fiscal, pit back as columns

# =====================================================
# 6) Keep the Latest PIT per Fiscal
#    (For the same pit, keep the latest fiscal only)
# =====================================================
df_financials_latest = (
    df_financials
    .sort_values(by=['id', 'pit', 'fiscal'], ascending=True)
    .drop_duplicates(subset=['id', 'pit'], keep='last')
)

# =====================================================
# 7) Align PIT vs Fiscal Date and Remove Outliers
"""
In this step, we handle cases where the same company may release 
multiple financial reports for the same fiscal period. 
To ensure consistency, we keep only the latest confirmed filing for each 
company and fiscal year before visualization.
"""
# =====================================================
df_financials_clean = df_financials_latest

# For each (id, fiscal), keep the most recent confirmed filing (largest pit).
# NOTE(review): GroupBy.last() returns the last non-null value per column,
# so when the newest filing contains NaNs it can stitch together values from
# different filings — confirm, or use drop_duplicates(keep='last') instead.
df_financials_snapshot = (
    df_financials_clean
    .sort_values(['id', 'fiscal', 'pit'])
    .groupby(['id', 'fiscal'], as_index=False)
    .last()
)

# =====================================================
# 8) Company Mapping (Ticker & Name)
#    (Attach gvkey, ticker(tic), company name(conml))
# =====================================================
df_stock = IdTable("spglobal-usa").get_stock()

# Exchange filter can be skipped if S&P500 filter will be used later
# df_stock = df_stock[df_stock["miccd"].isin(["XASE", "XNAS", "XNYS"])].copy()

# One representative security per entity: the lowest iid.
# NOTE(review): this is a simple first-iid pick, not the market-cap-based
# primary share used in Exercise 1 — confirm which one is intended here.
df_primary = (
    df_stock.sort_values(["gvkey", "iid"])
            .drop_duplicates(subset=["gvkey"], keep="first")
            .reset_index(drop=True)
)

df_financials_snapshot = df_financials_snapshot.merge(
    df_primary[["gvkey", "tic", "conml"]],
    left_on="id", right_on="gvkey", how="left"
)

# Drop records with missing gvkey/tic and remove outdated quarters.
df_financials_snapshot = df_financials_snapshot.dropna(subset=["gvkey", "tic"]).copy()
# 'fiscal' is a string column (cast via .astype({'fiscal': str}) at load
# time), so compare against a string: a str >= int comparison raises
# TypeError in pandas. Equal-length YYYYQQ strings sort chronologically,
# so this matches the intended numeric cutoff.
df_financials_snapshot = df_financials_snapshot[df_financials_snapshot['fiscal'] >= '200001']

# =====================================================
# 9) Filter Only S&P500 Constituents (groupby merge_asof version with tqdm)
#    Filter based on actual S&P500 membership timeline (spx_constituent)
# =====================================================
# Load S&P500 constituent info
spx_const = cf.get_df("spx_constituent", quantit_universe=False, fill_nan=False)

# Trim column names to gvkey codes (first 6 digits)
spx_const.columns = [c[:6] for c in spx_const.columns]
spx_const.index.name = "pit"
spx_const = spx_const.reset_index()

# Convert from wide → long format: (pit, gvkey, spx_flag)
spx_long = spx_const.melt(id_vars="pit", var_name="gvkey", value_name="spx_flag")

# Clean and sort types
spx_long["pit"] = pd.to_datetime(spx_long["pit"], errors="coerce")
spx_long["gvkey"] = spx_long["gvkey"].astype(str).str.zfill(6)
spx_long = (
    spx_long
    .dropna(subset=["pit"])
    .sort_values(["gvkey", "pit"])
    .reset_index(drop=True)
)

# Standardize gvkey/pit in financial snapshot
df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"], errors="coerce")
df_financials_snapshot["gvkey"] = df_financials_snapshot["id"].astype(str).str.zfill(6)
df_financials_snapshot = (
    df_financials_snapshot
    .dropna(subset=["pit"])
    .sort_values(["gvkey", "pit"])
    .reset_index(drop=True)
)

# Perform merge_asof by gvkey.
# Pre-split both frames into per-gvkey groups in ONE groupby pass instead of
# boolean-mask filtering the full frames inside the loop — the original
# per-iteration masks made this step O(groups × rows).
fin_groups = {key: grp for key, grp in df_financials_snapshot.groupby("gvkey", sort=False)}
spx_groups = {key: grp for key, grp in spx_long.groupby("gvkey", sort=False)}

merged_list = []
unique_gvkeys = df_financials_snapshot["gvkey"].unique()

for gvkey_code in tqdm(unique_gvkeys, desc="Merging S&P500 constituents by gvkey"):
    df_sub = fin_groups[gvkey_code]
    df_spx_sub = spx_groups.get(gvkey_code)
    if df_spx_sub is None or df_spx_sub.empty:
        continue  # company never appears in the constituent table

    merged = pd.merge_asof(
        df_sub.sort_values("pit"),
        df_spx_sub.sort_values("pit"),
        on="pit",
        by="gvkey",
        direction="backward",
        tolerance=pd.Timedelta("4D")  # match to the most recent S&P500 status within 4 days
    )
    merged_list.append(merged)

# Combine all merged results
df_financials_spx = pd.concat(merged_list, ignore_index=True)

# Keep only periods when company actually belonged to S&P500
df_financials_spx = df_financials_spx[df_financials_spx["spx_flag"] == 1.0].copy()

# Remove duplicates for same company-quarter and sort
df_financials_spx = (
    df_financials_spx
    .drop_duplicates(subset=["id", "fiscal"], keep="first")
    .sort_values(["gvkey", "fiscal", "pit"])
    .reset_index(drop=True)
)

print("After S&P500 filter (groupby merge_asof version, tolerance=4D)")
print(f"Rows: {len(df_financials_spx):,}")
print(f"Unique gvkeys: {df_financials_spx['gvkey'].nunique():,}")
print(f"Fiscal range: {df_financials_spx['fiscal'].min()} ~ {df_financials_spx['fiscal'].max()}")
display(df_financials_spx.head(10))

# =====================================================
# 10) S&P 500 ROE DuPont Analysis (TTM-based)
# =====================================================
# Use df_financials_spx to calculate S&P500-level indicators
df_spx = df_financials_spx.copy()
df_spx = df_spx.sort_values(['id', 'fiscal']).reset_index(drop=True)

# ---- TTM Calculation: NetIncome_TTM / Revenue_TTM ----
# FinHelper.rolling(window=4, method='sum') sums the trailing four quarters
# per id — presumably a trailing-twelve-month total; TODO confirm semantics.
ttm_results = []
for col in ['NetIncome', 'Revenue']:
    df_unpivot = df_spx[['id', 'fiscal', 'pit', col]].rename(columns={col: 'value'})
    df_ttm = FinHelper.rolling(df_unpivot, window=4, method='sum')
    df_ttm = df_ttm.rename(columns={'value': f"{col}_TTM"})
    ttm_results.append(df_ttm[['id', 'fiscal', f"{col}_TTM"]])

# Merge TTM results back
df_spx = df_spx.merge(ttm_results[0], on=['id', 'fiscal'], how='left')
df_spx = df_spx.merge(ttm_results[1], on=['id', 'fiscal'], how='left')

# ---- Aggregate by Quarter (S&P500-level snapshot) ----
market_dupont_ts = df_spx.groupby('fiscal').agg(
    Total_NetIncome_TTM=('NetIncome_TTM', 'sum'),
    Total_Revenue_TTM=('Revenue_TTM', 'sum'),
    Total_Assets=('TotalAssets', 'sum'),
    Total_Equity=('Equity', 'sum')
).reset_index()

# ---- Use up to 2025Q3 ----
# Equal-length YYYYQQ strings compare chronologically.
market_dupont_ts['fiscal'] = market_dupont_ts['fiscal'].astype(str)
market_dupont_ts = market_dupont_ts[market_dupont_ts['fiscal'] <= '202503']
market_dupont_ts = market_dupont_ts[market_dupont_ts['fiscal'] >= '202001']
# Replace 0 values with NaN to avoid division errors
market_dupont_ts = market_dupont_ts.replace(0, np.nan)

# ---- DuPont Components: ROE = margin × turnover × leverage ----
market_dupont_ts['Market_Profit_Margin'] = (
    market_dupont_ts['Total_NetIncome_TTM'] / market_dupont_ts['Total_Revenue_TTM']
)
market_dupont_ts['Market_Asset_Turnover'] = (
    market_dupont_ts['Total_Revenue_TTM'] / market_dupont_ts['Total_Assets']
)
market_dupont_ts['Market_Financial_Leverage'] = (
    market_dupont_ts['Total_Assets'] / market_dupont_ts['Total_Equity']
)
market_dupont_ts['Market_ROE'] = (
    market_dupont_ts['Market_Profit_Margin']
    * market_dupont_ts['Market_Asset_Turnover']
    * market_dupont_ts['Market_Financial_Leverage']
)

# ---- Convert fiscal to human-readable quarter ----
def convert_fiscal_to_quarter(fiscal):
    """Render a YYYYQQ fiscal key (str or int) as 'YYYYQn', e.g. '202503' -> '2025Q3'."""
    code = str(fiscal)
    return f"{int(code[:4])}Q{int(code[-2:])}"

# Human-readable x-axis labels, e.g. '2025Q3'
market_dupont_ts['Quarter'] = market_dupont_ts['fiscal'].apply(convert_fiscal_to_quarter)

# =====================================================
# 11) Plot
# =====================================================
print("===== S&P 500 ROE DuPont Analysis Results (≤ 2025Q3) =====")

fig, axes = plt.subplots(2, 2, figsize=(20, 12))
fig.suptitle('S&P 500 ROE DuPont Analysis (2020–2025Q3)', fontsize=20)

# 1) Market ROE
axes[0, 0].plot(market_dupont_ts['Quarter'], market_dupont_ts['Market_ROE'])
axes[0, 0].set_title('1. S&P 500 Market ROE', fontsize=14)
axes[0, 0].set_ylabel('ROE')
axes[0, 0].grid(True, linestyle=':', alpha=0.6)

# 2) Profit Margin
axes[0, 1].plot(market_dupont_ts['Quarter'], market_dupont_ts['Market_Profit_Margin'])
axes[0, 1].set_title('2. Profit Margin', fontsize=14)
axes[0, 1].set_ylabel('NetIncome / Revenue')
axes[0, 1].grid(True, linestyle=':', alpha=0.6)

# 3) Asset Turnover
axes[1, 0].plot(market_dupont_ts['Quarter'], market_dupont_ts['Market_Asset_Turnover'])
axes[1, 0].set_title('3. Asset Turnover', fontsize=14)
axes[1, 0].set_ylabel('Revenue / TotalAssets')
axes[1, 0].grid(True, linestyle=':', alpha=0.6)

# 4) Financial Leverage
axes[1, 1].plot(market_dupont_ts['Quarter'], market_dupont_ts['Market_Financial_Leverage'])
axes[1, 1].set_title('4. Financial Leverage', fontsize=14)
axes[1, 1].set_ylabel('TotalAssets / Equity')
axes[1, 1].grid(True, linestyle=':', alpha=0.6)

# Rotate x-axis labels and reduce tick frequency
# (keep every 8th tick so quarterly labels don't overlap)
for ax in axes.flat:
    ax.tick_params(axis='x', rotation=45)
    ax.set_xticks(ax.get_xticks()[::8])

# rect leaves headroom for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-dd2a5d6e13d354d39878eb09468264e7e13144ee%2Fimage%20(13).png?alt=media" alt=""><figcaption></figcaption></figure>

***

### 6. Financial Metrics Analysis by US GICS Sector

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from finter.data import ContentFactory, FinHelper, IdTable
from scipy.stats.mstats import winsorize

# =====================================================
# 0) Universe & Period
# =====================================================
data_start = 20090101
data_end   = 20251026
cf = ContentFactory(universe_name='us_stock', start=data_start, end=data_end)


def _load_pit(item: str):
    """Load one point-in-time (PIT) item in long/unpivot form.

    Drops all-NaN columns and normalizes the 'fiscal' key to a fixed-width
    string (e.g. 202503 -> '202503') so later comparisons and merges line up.
    This replaces thirteen copies of the identical load pipeline.
    """
    df = cf.get_df(item, mode='unpivot').dropna(how='all', axis=1)
    return df.astype({'fiscal': int}).astype({'fiscal': str})


# =====================================================
# 1) Load Raw Financial Statement Data (Income Statement)
# =====================================================
rev    = _load_pit('pit-revtq')
niq    = _load_pit('pit-niq')
oiadpq = _load_pit('pit-oiadpq')
cogsq  = _load_pit('pit-cogsq')

# =====================================================
# 2) EBITDA and PPE
# =====================================================
oibdpq = _load_pit('pit-oibdpq')
ppegtq = _load_pit('pit-ppegtq')

# =====================================================
# 3) Balance Sheet and Cash Flow Items
# =====================================================
ltq  = _load_pit('pit-ltq')
ceqq = _load_pit('pit-ceqq')
atq  = _load_pit('pit-atq')
actq = _load_pit('pit-actq')
lctq = _load_pit('pit-lctq')
cheq = _load_pit('pit-cheq')
cfoq = _load_pit('pit-cfoq')

# =====================================================
# 4) Keep the Earliest PIT per Fiscal Period
# =====================================================
def keep_earliest_per_fiscal(df):
    """Collapse to one row per (id, fiscal): the earliest point-in-time filing.

    Note: groupby(...).first() takes the first non-null value per column
    within each group, so sorting by pit first makes it the earliest filing.
    """
    ordered = df.sort_values(['id', 'fiscal', 'pit'])
    return ordered.groupby(['id', 'fiscal'], as_index=False).first()

# Apply the earliest-PIT filter to every raw item (first confirmed filing
# per (id, fiscal) — see keep_earliest_per_fiscal above)
datasets = [rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq]
datasets = [keep_earliest_per_fiscal(df) for df in datasets]
rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq = datasets

# =====================================================
# 5) Apply unify_idx and Merge into a Single DataFrame
# =====================================================
# NOTE(review): unify_idx presumably reindexes all frames onto a shared
# (id, fiscal, pit) index so the axis=1 concat below aligns rows — TODO confirm.
u_rev, u_niq, u_oiadpq, u_cogsq, u_oibdpq, u_ppegtq, u_ltq, u_ceqq, u_atq, u_actq, u_lctq, u_cheq, u_cfoq = FinHelper.unify_idx(
    rev, niq, oiadpq, cogsq, oibdpq, ppegtq, ltq, ceqq, atq, actq, lctq, cheq, cfoq
)

# One readable column per financial statement item
df_financials = pd.concat(
    [
        u_rev.rename(columns={'value': 'Revenue'}),
        u_niq.rename(columns={'value': 'NetIncome'}),
        u_oiadpq.rename(columns={'value': 'OperatingIncome'}),
        u_cogsq.rename(columns={'value': 'COGS'}),
        u_oibdpq.rename(columns={'value': 'EBITDA'}),
        u_ppegtq.rename(columns={'value': 'PPE'}),
        u_ltq.rename(columns={'value': 'TotalLiabilities'}),
        u_ceqq.rename(columns={'value': 'Equity'}),
        u_atq.rename(columns={'value': 'TotalAssets'}),
        u_actq.rename(columns={'value': 'CurrentAssets'}),
        u_lctq.rename(columns={'value': 'CurrentLiabilities'}),
        u_cheq.rename(columns={'value': 'Cash'}),
        u_cfoq.rename(columns={'value': 'OperatingCashFlow'})
    ],
    axis=1
).reset_index()

# =====================================================
# 6) One Row per (id, PIT)
# NOTE(review): the original header said "Latest PIT per Fiscal", but this
# step actually keeps, for each (id, pit) report date, the row with the
# most recent fiscal period (sorted by fiscal, keep='last').
# =====================================================
df_financials_latest = (
    df_financials
    .sort_values(by=['id', 'pit', 'fiscal'], ascending=True)
    .drop_duplicates(subset=['id', 'pit'], keep='last')
)

# =====================================================
# 7) Align PIT vs Fiscal Date and Remove Outliers
"""
In this step, we handle cases where the same company may release 
multiple financial reports for the same fiscal period. 
To ensure consistency, we keep only the latest confirmed filing for each 
company and fiscal year before visualization.
"""
# =====================================================

df_financials_clean = df_financials_latest

# Keep only the latest PIT for each fiscal period
# (groupby(...).last() takes the last non-null value per column in each group)
df_financials_snapshot = (
    df_financials_clean
    .sort_values(['id', 'fiscal', 'pit'])
    .groupby(['id', 'fiscal'], as_index=False)
    .last()
)

# =====================================================
# 8) Company Mapping (Ticker & Name)
# Used for simplicity.
# =====================================================
df_stock = IdTable("spglobal-usa").get_stock()
# Restrict to the major US exchanges: AMEX, NASDAQ, NYSE
df_stock = df_stock[df_stock["miccd"].isin(["XASE", "XNAS", "XNYS"])].copy()

# One representative security per entity: smallest iid per gvkey
df_primary = (
    df_stock.sort_values(["gvkey", "iid"])
             .drop_duplicates(subset=["gvkey"], keep="first")
             .reset_index(drop=True)
)

# Attach ticker ('tic') and company name ('conml') to each financial row
df_financials_snapshot = df_financials_snapshot.merge(
    df_primary[["gvkey", "tic", "conml"]],
    left_on="id", right_on="gvkey", how="left"
)
df_financials_snapshot = df_financials_snapshot.dropna(subset=["gvkey", "tic"]).copy()
# BUG FIX: 'fiscal' is a string at this point (cast to str at load time), so
# comparing it against the integer 200001 raises TypeError. Cast explicitly to
# int for the comparison — safe whether the column arrives as str or int.
df_financials_snapshot = df_financials_snapshot[df_financials_snapshot['fiscal'].astype(int) >= 200001]

# =====================================================
# 9) Profitability Ratios
# =====================================================
# Quarterly margins; zero Revenue yields inf/NaN, filtered in a later step
df_financials_snapshot["OperatingMargin"] = (
    df_financials_snapshot["OperatingIncome"] / df_financials_snapshot["Revenue"]
)
df_financials_snapshot["NetMargin"] = (
    df_financials_snapshot["NetIncome"] / df_financials_snapshot["Revenue"]
)

# =====================================================
# 11) Calculating EV/EBITDA Multiple
# =====================================================

# Daily market cap per gvkeyiid; fill interior gaps of up to 3 days only
df_mktcap_pv = cf.get_df("mkt_cap")
df_mktcap_pv = df_mktcap_pv.dropna(axis=1, how="all").ffill(limit_area='inside', limit=3)

# Wide -> long: one row per (date, gvkeyiid)
df_mktcap = (
    df_mktcap_pv.reset_index()
                 .melt(id_vars="index", var_name="gvkeyiid", value_name="mktcap")
)
# First 6 characters of gvkeyiid identify the entity (gvkey)
df_mktcap["gvkey"] = df_mktcap["gvkeyiid"].str[:6]

# Entity-level market cap: sum across the entity's share classes per date
df_entity_mktcap = (
    df_mktcap.groupby(["index", "gvkey"], as_index=False)["mktcap"]
             .sum()
             .rename(columns={"index": "pit", "mktcap": "entity_mktcap"})
)

df_entity_mktcap["pit"] = pd.to_datetime(df_entity_mktcap["pit"])

df_financials_snapshot["pit"] = pd.to_datetime(df_financials_snapshot["pit"])

# For each filing date, take the latest market cap on or before that date
df_financials_snapshot = pd.merge_asof(
    df_financials_snapshot.sort_values("pit"),
    df_entity_mktcap.sort_values("pit"),
    by="gvkey",
    on="pit",
    direction="backward"
)

df_financials_snapshot = df_financials_snapshot.drop(columns=["index"], errors="ignore")


# Convert to millions so market cap and statement items share units
# NOTE(review): assumes entity_mktcap is in USD and statement items are in
# million USD — TODO confirm against the data dictionary.
df_financials_snapshot["entity_mktcap_million"] = (
    df_financials_snapshot["entity_mktcap"] / 1_000_000
)

# EV = Market Cap (in million USD) + Total Liabilities - Cash
df_financials_snapshot["EnterpriseValue"] = (
    df_financials_snapshot["entity_mktcap_million"]
    + df_financials_snapshot["TotalLiabilities"]
    - df_financials_snapshot["Cash"]
)

# Only meaningful for positive EBITDA; everything else becomes NaN
df_financials_snapshot["EV_EBITDA"] = np.where(
    df_financials_snapshot["EBITDA"] > 0,
    df_financials_snapshot["EnterpriseValue"] / df_financials_snapshot["EBITDA"],
    np.nan
)

# =====================================================
# 12) Data Cleaning and Sanity Checks
# =====================================================
# Negative-EV multiples and |margin| > 1000% are treated as data errors
df_financials_snapshot.loc[df_financials_snapshot["EnterpriseValue"] <= 0, "EV_EBITDA"] = np.nan
df_financials_snapshot.loc[df_financials_snapshot["OperatingMargin"].abs() > 10, "OperatingMargin"] = np.nan
df_financials_snapshot.loc[df_financials_snapshot["NetMargin"].abs() > 10, "NetMargin"] = np.nan

df_financials_snapshot["OperatingMargin_pct"] = df_financials_snapshot["OperatingMargin"] * 100
df_financials_snapshot["NetMargin_pct"] = df_financials_snapshot["NetMargin"] * 100

# =====================================================
# 13) Preview
# =====================================================
print("--- Preview of Financials Snapshot (Head) ---")
print(
    df_financials_snapshot[
        ["tic", "fiscal", "OperatingMargin_pct", "NetMargin_pct", "EV_EBITDA"]
    ].head(10)
)
print("-" * 50)


# =====================================================
# 16) GICS Sector Mapping
# =====================================================
# quantit_universe=False for research on raw data; True when writing Alpha models.
quantit_universe = False
fill_nan = False

df_gics_pv = cf.get_df('gics', quantit_universe=quantit_universe, fill_nan=fill_nan)
df_gics_pv = df_gics_pv.dropna(axis=1, how='all')  # drop columns with no data
# Convert float GICS codes to clean strings (e.g. 45102010.0 -> '45102010')
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 (use .map)
df_gics_pv = df_gics_pv.applymap(lambda x: str(int(x)) if pd.notna(x) else x)
# Wide -> long: one row per (date, gvkey)
df_gics = df_gics_pv.dropna(axis=1, how='all').reset_index().melt(id_vars='index', var_name='gvkey', value_name='gics')
# Extract first 2 digits from gics column (Sector)
df_gics["gics_sector"] = df_gics["gics"].astype(str).str[:2]

# Extract first 4 digits from gics column (Sub-Sector)
df_gics["gics_sub_sector"] = df_gics["gics"].astype(str).str[:4]


# Rename df_gics 'index' to 'pit' for merge_asof
df_gics_renamed = df_gics.rename(columns={"index": "pit"}).sort_values(["gvkey", "pit"])

# Merge financials and GICS (by gvkey + pit, match nearest past GICS code)
df = pd.merge_asof(
    df_financials_snapshot.sort_values("pit"),
    df_gics_renamed.sort_values("pit"),
    by="gvkey",
    on="pit",
    direction="backward"   # Get the last value before the pit timestamp
)

# Drop the full gics code; only sector / sub-sector prefixes are kept
df = df.drop(columns=["gics"])
# BUG FIX: sort_values is not in-place — the result was silently discarded.
# Assign it back so the frame is actually ordered by (gvkey, fiscal, pit).
df = df.sort_values(by=["gvkey", "fiscal", "pit"])

print("--- Dataframe Head after GICS Merge ---")
print(df.head())
print("-" * 50)

# =====================================================
# 17) Duplicate Checks
# =====================================================
# Check for duplicates based on (id, fiscal, pit)
dup_check_pit = df[df.duplicated(subset=["id", "fiscal", "pit"], keep=False)]
print(f"Number of rows with duplicate (id, fiscal, pit) combinations: {len(dup_check_pit)}")

# Check which firms/quarters have duplicate pits
dup_summary_pit = (
    dup_check_pit.groupby(["id", "fiscal", "pit"])
                  .size()
                  .reset_index(name="count")
                  .sort_values("count", ascending=False)
)
print("--- Duplicate Summary ---")
print(dup_summary_pit.head(10))
print("-" * 50)

# =====================================================
# 18) NetIncome TTM (Trailing 12-Month) Calculation
# =====================================================
# *** THIS IS THE ACTIVATED CODE BLOCK ***
# Reshape to the id/fiscal/pit/value layout expected by FinHelper.rolling
df_netincome = (
    df[["id", "fiscal", "pit", "NetIncome"]]
      .rename(columns={"NetIncome": "value"})
      .dropna(subset=["value"])
)

# Trailing 4-quarter sum per company
netincome_ttm = FinHelper.rolling(
    df_netincome,
    window=4,
    method='sum'
)

# Normalize fiscal to int on both sides so the merge keys match
netincome_ttm["fiscal"] = netincome_ttm["fiscal"].astype(int)
df["fiscal"] = df["fiscal"].astype(int)
netincome_ttm = netincome_ttm.rename(columns={"value": "NetIncome_TTM"})

df = df.merge(netincome_ttm[["id", "fiscal", "NetIncome_TTM"]], on=["id", "fiscal"], how="left")

# Drop rows where TTM couldn't be calculated (first 3 quarters for each stock)
df = df.dropna(subset=["NetIncome_TTM"])

print("--- TTM Calculation and Merge Complete ---")
print(df[["tic", "fiscal", "NetIncome", "NetIncome_TTM"]].head())
print("-" * 50)

# =====================================================
# 19) Filter Top 50 Companies per Sector/Fiscal
# =====================================================
# Sort descending by market cap within each (sector, fiscal) group,
# then keep the 50 largest companies of each group
df_top50 = (
    df.sort_values(["gics_sector", "fiscal", "entity_mktcap_million"], ascending=[True, True, False])
      .groupby(["gics_sector", "fiscal"], group_keys=False)
      .head(50)   # Keep only the top 50
)

# =====================================================
# 20) Calculate Sector Aggregates (Robust Median Method)
# =====================================================

def _winsorized_median(series):
    """Median of the strictly positive values after clipping the top and
    bottom 5% (winsorize). Returns NaN when there are no positive values."""
    positive = series[series > 0].dropna()
    if positive.empty:
        return np.nan
    return pd.Series(winsorize(positive, limits=[0.05, 0.05])).median()


def calculate_sector_metrics(g):
    """Aggregate one (gics_sector, fiscal) group into sector-level multiples.

    Parameters
    ----------
    g : DataFrame group with columns 'entity_mktcap_million', 'NetIncome_TTM',
        'Equity' and 'EV_EBITDA'.

    Returns
    -------
    pd.Series with:
      - per_sector_sum / pbr_sector_sum: aggregate 'sum/sum' multiples
        (outlier-prone, kept for comparison)
      - per_sector_median / pbr_sector_median / ev_ebitda_sector_median:
        robust winsorized medians of per-company multiples
      - total_mktcap: group market cap (million USD)
    """
    # Per-company multiples; infinities from zero denominators become NaN.
    # Computed as locals instead of mutating g (the original wrote PER/PBR
    # columns onto the groupby slice, triggering SettingWithCopy warnings).
    per = (g['entity_mktcap_million'] / g['NetIncome_TTM']).replace([np.inf, -np.inf], np.nan)
    pbr = (g['entity_mktcap_million'] / g['Equity']).replace([np.inf, -np.inf], np.nan)
    ev_ebitda = g['EV_EBITDA'].replace([np.inf, -np.inf], np.nan)

    total_ni = g["NetIncome_TTM"].sum()
    total_eq = g["Equity"].sum()
    total_mktcap = g["entity_mktcap_million"].sum()

    return pd.Series({
        # Original 'sum/sum' method (prone to outliers)
        "per_sector_sum": total_mktcap / total_ni if total_ni > 0 else np.nan,
        "pbr_sector_sum": total_mktcap / total_eq if total_eq > 0 else np.nan,

        # Robust winsorized-median method
        "per_sector_median": _winsorized_median(per),
        "pbr_sector_median": _winsorized_median(pbr),
        "ev_ebitda_sector_median": _winsorized_median(ev_ebitda),

        "total_mktcap": total_mktcap
    })

# One row per (sector, fiscal) with the aggregate multiples above.
# NOTE(review): pandas 2.2+ warns about include_groups in groupby.apply — verify.
df_sector = (
    df_top50.groupby(["gics_sector", "fiscal"])
            .apply(calculate_sector_metrics)
            .reset_index()
)

# =====================================================
# 21) Prepare for Plotting
# =====================================================
def fiscal_to_quarter(fiscal: int) -> str:
    """Format an integer YYYYQQ fiscal key as 'YYYYQn' (e.g. 202503 -> '2025Q3')."""
    year, quarter = divmod(fiscal, 100)
    return f"{year}Q{quarter}"

# Human-readable quarter labels for the x-axis
df_sector["fiscal_str"] = df_sector["fiscal"].apply(fiscal_to_quarter)

# =====================================================
# 22) Sector Name Mapping
# =====================================================
# GICS 2-digit sector codes -> display names
sector_map = {
    "10": "Energy",
    "15": "Materials",
    "20": "Industrials",
    "25": "Consumer Discretionary",
    "30": "Consumer Staples",
    "35": "Health Care",
    "40": "Financials",
    "45": "Information Technology",
    "50": "Communication Services",
    "55": "Utilities",
    "60": "Real Estate"
}

# Order sectors numerically first, then any non-numeric codes alphabetically
df_sector["gics_sector"] = df_sector["gics_sector"].astype(str)
unique_sectors = df_sector["gics_sector"].dropna().unique()
numeric_sectors = [s for s in unique_sectors if s.isdigit()]
other_sectors   = [s for s in unique_sectors if not s.isdigit()]
sorted_sectors = sorted(numeric_sectors, key=lambda x: int(x)) + sorted(other_sectors)


# =====================================================
# 23) Sector Visualization (Using Median PER/PBR/EV_EBITDA)
# =====================================================

def _plot_metric_vs_mktcap(data, column, metric, color, sector_name):
    """Plot one sector-level median multiple (left axis, `color`) against
    total sector market cap in trillion USD (right axis, green dashed).

    Factors out three near-identical ~35-line chart sections that differed
    only in the metric column, its display name and the line color.
    """
    fig, ax1 = plt.subplots(figsize=(14,5))
    ax1.plot(data["fiscal_str"], data[column], color=color, label=f"Median {metric} (Sector)")
    ax1.set_xlabel("Fiscal Quarter")
    ax1.set_ylabel(f"Median {metric} (Sector)", color=color)
    ax1.tick_params(axis="y", labelcolor=color)

    # Horizontal reference line at the period-average of the median multiple
    metric_mean = data[column].mean()
    if pd.notna(metric_mean):
        ax1.axhline(y=metric_mean, color=color, linestyle=":", linewidth=1, alpha=0.7)
        # Center the annotation horizontally over the series
        ax1.text(len(data)//2, metric_mean, f"Avg Median {metric}: {metric_mean:.2f}",
                 color=color, ha="center", va="bottom", fontsize=9, alpha=0.8)

    ax2 = ax1.twinx()
    ax2.plot(data["fiscal_str"], data["total_mktcap"]/1e6, color="green", linestyle="--", label="Market Cap (Trillion)")
    ax2.set_ylabel("Market Cap (Trillion USD)", color="green")
    ax2.tick_params(axis="y", labelcolor="green")

    ax1.set_xticks(range(len(data["fiscal_str"])))
    ax1.set_xticklabels(data["fiscal_str"], rotation=60, ha="right")

    # Light vertical guides at each quarter; range adapts to data length
    ax1.vlines(
        x=range(len(data["fiscal_str"])),
        ymin=ax1.get_ylim()[0],
        ymax=ax1.get_ylim()[1],
        colors="lightgray",
        linestyles="--",
        linewidth=0.5,
        alpha=0.7
    )

    plt.title(f"{sector_name} - Median {metric} vs Market Cap")
    fig.tight_layout()
    plt.show()


for sector in sorted_sectors:
    # Restrict to 2011Q1 .. 2025Q2 (string comparison works on fixed-width "YYYYQn")
    data = df_sector[
        (df_sector["gics_sector"] == sector) &
        (df_sector["fiscal_str"] >= "2011Q1") &
        (df_sector["fiscal_str"] <= "2025Q2")
    ].sort_values("fiscal")

    # If data is empty after filtering, skip to the next sector
    if data.empty:
        continue

    sector_name = sector_map.get(str(sector), f"Sector {sector}")

    # One chart per metric: PER (blue), PBR (orange), EV/EBITDA (red)
    _plot_metric_vs_mktcap(data, "per_sector_median", "PER", "blue", sector_name)
    _plot_metric_vs_mktcap(data, "pbr_sector_median", "PBR", "orange", sector_name)
    _plot_metric_vs_mktcap(data, "ev_ebitda_sector_median", "EV/EBITDA", "red", sector_name)

print("--- All processes complete. ---")

```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-2c6ae6ed9c67dd09f4ad4fbc0ea4d8989da32da6%2Fimage%20(29).png?alt=media" alt=""><figcaption></figcaption></figure>

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-09c5d4c5dfb6fef714c12cf0e3fc01bb8d9eaf00%2Fimage%20(30).png?alt=media" alt=""><figcaption></figcaption></figure>

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-94dc2f01e6c75b9fde9b617eb75d2e53b78126b7%2Fimage%20(31).png?alt=media" alt=""><figcaption></figcaption></figure>

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-a0c09c41bebf6bf8948f932a7f55292fc05a06e0%2Fimage%20(32).png?alt=media" alt=""><figcaption></figcaption></figure>

...

***

### **7. Correlation Analysis in FX Markets**

```python
import pandas as pd
import numpy as np
from finter.data import ContentFactory
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, kendalltau
from scipy.spatial.distance import pdist, squareform

# Load data
cf = ContentFactory('raw', 20180101, 20251027)
fx_data = cf.get_df("content.fnguide.ftp.economy.currency.1d", fill_nan=True)
# Five liquid crosses; note USDJPY/USDKRW are USD-base quotes
fx = fx_data[['EURUSD', 'GBPUSD', 'USDJPY', 'USDKRW', 'AUDUSD']].copy()
# Fill small interior gaps (<= 3 days) only, then drop rows still incomplete
fx = fx.ffill(limit_area='inside', limit=3).dropna()
# Simple daily returns
returns = fx.pct_change().dropna()

print("Professional Correlation Analysis Methods\n")


# ================================================================
# Method 1: Rolling Correlation (Dynamic)
# ================================================================

def method1_rolling_correlation():
    """
    Time-varying correlation
    - Captures market regime changes
    - Identifies correlation spikes during crisis periods

    Uses the module-level `returns` and `fx` frames; prints summary stats
    and shows a two-panel figure (prices + rolling correlation).
    """
    print("="*70)
    print("Method 1: Rolling (Dynamic) Correlation")
    print("="*70)
    print("Why: Correlations change over time! Static avg masks regime shifts.")
    
    # 60-day rolling correlation between EURUSD and GBPUSD
    window = 60
    rolling_corr = returns['EURUSD'].rolling(window).corr(returns['GBPUSD'])
    
    print(f"\nEUR/GBP Rolling Correlation (60-day window):")
    print(f"  Mean:    {rolling_corr.mean():.3f}")
    print(f"  Std Dev: {rolling_corr.std():.3f}")
    print(f"  Min:     {rolling_corr.min():.3f}")
    print(f"  Max:     {rolling_corr.max():.3f}")
    
    # Visualization
    fig, axes = plt.subplots(2, 1, figsize=(14, 8))
    
    # Price chart (top panel)
    axes[0].plot(fx.index, fx['EURUSD'], label='EURUSD', linewidth=2)
    axes[0].plot(fx.index, fx['GBPUSD'], label='GBPUSD', linewidth=2)
    axes[0].set_title('EUR/USD and GBP/USD Exchange Rates', fontsize=13, fontweight='bold')
    axes[0].set_ylabel('Exchange Rate')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    
    # Rolling correlation (bottom panel) with mean reference line
    axes[1].plot(rolling_corr.index, rolling_corr.values, linewidth=2.5, color='purple')
    axes[1].axhline(y=rolling_corr.mean(), color='red', linestyle='--', linewidth=2, 
                   label=f'Mean: {rolling_corr.mean():.3f}')
    axes[1].fill_between(rolling_corr.index, 0, rolling_corr.values, alpha=0.3)
    axes[1].set_title(f'Rolling Correlation ({window}-day)', fontsize=13, fontweight='bold')
    axes[1].set_ylabel('Correlation')
    axes[1].legend()
    axes[1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    print("\nKey Insight:")
    print("  - Correlation spikes during crisis periods (flight to safety)")
    print("  - More stable during normal market conditions")


# ================================================================
# Method 2: Exponential Weighted Correlation (EWMA)
# ================================================================

def method2_ewma_correlation():
    """Compare EWMA correlation against an equal-weight rolling correlation
    for EURUSD vs GBPUSD returns; print summary stats and plot both series.

    Reads the module-level `returns` frame. The EWMA weighting follows the
    RiskMetrics convention of emphasizing recent observations.
    """
    header = "=" * 70
    print("\n" + header)
    print("Method 2: Exponentially Weighted Moving Average (EWMA) Correlation")
    print(header)
    print("Why: Recent data matters more! EWMA gives higher weight to recent obs.")

    span = 94  # RiskMetrics-style decay horizon (~3 months of daily data)
    eur = returns['EURUSD']
    gbp = returns['GBPUSD']

    # Exponentially weighted vs equal-weight window of the same length
    ewm_series = eur.ewm(span=span).corr(gbp)
    box_series = eur.rolling(span).corr(gbp)

    print(f"\nComparison (span={span}):")
    print(f"  EWMA Mean:    {ewm_series.mean():.3f}")
    print(f"  Rolling Mean: {box_series.mean():.3f}")
    print(f"  EWMA reacts faster to recent changes!")

    # Visualization: both correlation paths on one chart
    plt.figure(figsize=(14, 6))
    plt.plot(box_series.index, box_series.values,
             label=f'Simple Rolling ({span}-day)', linewidth=2, alpha=0.7)
    plt.plot(ewm_series.index, ewm_series.values,
             label=f'EWMA (span={span})', linewidth=2.5, alpha=0.8)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.3)
    plt.title('EWMA vs Simple Rolling Correlation', fontsize=14, fontweight='bold')
    plt.ylabel('Correlation')
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nKey Insight:")
    print("  - EWMA adapts faster to regime changes")
    print("  - Used by JP Morgan's RiskMetrics methodology")


# ================================================================
# Method 3: Rank-Based Correlation (Spearman)
# ================================================================

def method3_rank_correlation():
    """
    Rank-based correlation (Spearman)
    - Captures non-linear relationships
    - Robust to outliers

    Uses the module-level `returns` frame; prints both correlation matrices
    and their difference, then shows three side-by-side heatmaps.
    """
    print("\n" + "="*70)
    print("Method 3: Rank-Based Correlation (Spearman)")
    print("="*70)
    print("Why: Captures non-linear relationships & robust to outliers")
    
    # Calculate both Pearson and Spearman on the full-sample returns
    pearson_corr = returns.corr(method='pearson')
    spearman_corr = returns.corr(method='spearman')
    
    print("\nPearson (Linear) Correlation:")
    print(pearson_corr.round(3))
    
    print("\nSpearman (Rank) Correlation:")
    print(spearman_corr.round(3))
    
    # Difference matrix highlights pairs with non-linear dependence
    diff = spearman_corr - pearson_corr
    
    print("\nDifference (Spearman - Pearson):")
    print(diff.round(3))
    print("\n  Large differences indicate non-linear relationships!")
    
    # Visualization
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    
    # Pearson
    sns.heatmap(pearson_corr, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=1, fmt='.2f', vmin=-1, vmax=1,
                ax=axes[0], cbar_kws={"shrink": 0.8})
    axes[0].set_title('Pearson Correlation\n(Linear)', fontsize=13, fontweight='bold')
    
    # Spearman
    sns.heatmap(spearman_corr, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=1, fmt='.2f', vmin=-1, vmax=1,
                ax=axes[1], cbar_kws={"shrink": 0.8})
    axes[1].set_title('Spearman Correlation\n(Rank-based)', fontsize=13, fontweight='bold')
    
    # Difference (narrower color range to make small gaps visible)
    sns.heatmap(diff, annot=True, cmap='RdBu_r', center=0,
                square=True, linewidths=1, fmt='.2f', vmin=-0.2, vmax=0.2,
                ax=axes[2], cbar_kws={"shrink": 0.8})
    axes[2].set_title('Difference\n(Spearman - Pearson)', fontsize=13, fontweight='bold')
    
    plt.tight_layout()
    plt.show()
    
    print("\nKey Insight:")
    print("  - If Spearman != Pearson, relationship is non-linear")
    print("  - Spearman is better for fat-tailed distributions (common in FX)")


# ================================================================
# Method 4: Conditional Correlation (Market Stress)
# ================================================================

def method4_conditional_correlation():
    """
    Conditional (regime-dependent) correlation.

    Splits the sample into high- and low-volatility regimes and compares
    the correlation matrices estimated within each regime.
    """
    print("\n" + "="*70)
    print("Method 4: Conditional Correlation (Market Regimes)")
    print("="*70)
    print("Why: Correlations differ in bull vs bear markets!")

    # Stress proxy: 20-day USD/KRW volatility, annualized and in percent
    stress = returns['USDKRW'].rolling(20).std() * np.sqrt(252) * 100
    upper_thr = stress.quantile(0.75)
    lower_thr = stress.quantile(0.25)
    crisis_mask = stress > upper_thr   # top 25% volatility days
    calm_mask = stress < lower_thr     # bottom 25% volatility days

    # Regime-specific correlation matrices and their gap
    corr_high_stress = returns[crisis_mask].corr()
    corr_low_stress = returns[calm_mask].corr()
    corr_diff = corr_high_stress - corr_low_stress

    print(f"\nHigh Stress Periods ({crisis_mask.sum()} days):")
    print(corr_high_stress.round(3))

    print(f"\nLow Stress Periods ({calm_mask.sum()} days):")
    print(corr_low_stress.round(3))

    print("\nDifference (High Stress - Low Stress):")
    print(corr_diff.round(3))

    # Visualization: stress time series plus three regime heatmaps
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    ax_ts = axes[0, 0]
    ax_ts.plot(stress.index, stress.values, linewidth=1.5, color='darkred')
    ax_ts.axhline(y=upper_thr, color='red', linestyle='--',
                  linewidth=2, label='High Stress Threshold')
    ax_ts.axhline(y=lower_thr, color='green', linestyle='--',
                  linewidth=2, label='Low Stress Threshold')
    ax_ts.fill_between(stress.index, 0, stress.values,
                       where=crisis_mask, alpha=0.3, color='red', label='High Stress')
    ax_ts.fill_between(stress.index, 0, stress.values,
                       where=calm_mask, alpha=0.3, color='green', label='Low Stress')
    ax_ts.set_title('Market Stress Indicator (USD/KRW Vol)', fontsize=12, fontweight='bold')
    ax_ts.set_ylabel('Annualized Volatility (%)')
    ax_ts.legend(fontsize=9)
    ax_ts.grid(True, alpha=0.3)

    # One heatmap per (matrix, colormap, value range, axis, title)
    heatmap_specs = [
        (corr_high_stress, 'Reds', -1, 1, axes[0, 1], 'High Stress Correlation'),
        (corr_low_stress, 'Greens', -1, 1, axes[1, 0], 'Low Stress Correlation'),
        (corr_diff, 'RdBu_r', -0.5, 0.5, axes[1, 1], 'Difference (High - Low)'),
    ]
    for matrix, cmap, vlo, vhi, ax, title in heatmap_specs:
        sns.heatmap(matrix, annot=True, cmap=cmap, center=0,
                    square=True, linewidths=1, fmt='.2f', vmin=vlo, vmax=vhi,
                    ax=ax, cbar_kws={"shrink": 0.8})
        ax.set_title(title, fontsize=12, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("\nKey Insight:")
    print("  - Correlations INCREASE during crisis (contagion effect)")
    print("  - Diversification benefits disappear when you need them most!")


# ================================================================
# Method 5: Distance Correlation (Detects Any Dependency)
# ================================================================

def method5_distance_correlation():
    """
    Distance-style dependency measures.

    Contrasts plain return correlation with the correlation of absolute
    returns, which captures volatility co-movement that the linear
    measure misses.
    """
    print("\n" + "="*70)
    print("Method 5: Distance-Based Measures")
    print("="*70)
    print("Why: Captures ALL types of dependencies, not just linear!")

    # NOTE: a true distance correlation needs pairwise-distance matrices;
    # here the idea is approximated with |return| correlation instead.
    linear_corr = returns.corr()        # direction co-movement
    abs_corr = returns.abs().corr()     # volatility co-movement

    print("\nLinear Return Correlation:")
    print(linear_corr.round(3))

    print("\nAbsolute Return Correlation (Volatility Co-movement):")
    print(abs_corr.round(3))

    # Side-by-side heatmaps of the two measures
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    panels = [
        (linear_corr, 'coolwarm', 0, -1, 1, axes[0], 'Return Correlation\n(Direction)'),
        (abs_corr, 'YlOrRd', 0.5, 0, 1, axes[1], 'Absolute Return Correlation\n(Volatility)'),
    ]
    for matrix, cmap, mid, vlo, vhi, ax, title in panels:
        sns.heatmap(matrix, annot=True, cmap=cmap, center=mid,
                    square=True, linewidths=1, fmt='.2f', vmin=vlo, vmax=vhi,
                    ax=ax, cbar_kws={"shrink": 0.8})
        ax.set_title(title, fontsize=13, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("\nKey Insight:")
    print("  - Returns may be uncorrelated, but volatilities can be correlated!")
    print("  - Important for risk management and portfolio construction")


# ================================================================
# Method 6: DCC-GARCH (Industry Standard for Risk Management)
# ================================================================

def method6_dcc_garch_concept():
    """
    DCC-GARCH concept demo.

    Plots a simplified volatility-adjusted rolling correlation next to a
    plain rolling correlation; a full DCC-GARCH requires dedicated
    libraries (arch, statsmodels).
    """
    print("\n" + "="*70)
    print("Method 6: DCC-GARCH (Industry Gold Standard)")
    print("="*70)
    print("Why: Used by banks & hedge funds for risk management")
    print("\nNote: Full implementation requires specialized libraries (arch, statsmodels)")
    print("Here we show the concept using simplified approach")

    # GARCH stand-in: simple 20-day rolling volatility
    vol_window = 20
    rolling_vol = returns.rolling(vol_window).std()

    # First DCC stage analogue: devolatilized (standardized) returns
    standardized = (returns / rolling_vol).dropna()

    # Second stage analogue: rolling correlation of standardized returns
    dcc_like = standardized['EURUSD'].rolling(60).corr(standardized['GBPUSD'])

    # Baseline for comparison: rolling correlation of raw returns
    regular_corr = returns['EURUSD'].rolling(60).corr(returns['GBPUSD'])

    plt.figure(figsize=(14, 6))
    plt.plot(regular_corr.index, regular_corr.values,
            label='Standard Rolling Correlation', linewidth=2, alpha=0.7)
    plt.plot(dcc_like.index, dcc_like.values,
            label='DCC-like (Volatility-Adjusted)', linewidth=2.5, alpha=0.8)
    plt.axhline(y=0, color='black', linestyle='-', linewidth=1, alpha=0.3)
    plt.title('Standard vs DCC-GARCH Style Correlation', fontsize=14, fontweight='bold')
    plt.ylabel('Correlation')
    plt.legend(fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    print("\nWhat is DCC-GARCH?")
    print("  - Dynamic Conditional Correlation - GARCH")
    print("  - Models time-varying volatility AND correlation")
    print("  - Accounts for volatility clustering")
    print("  - Used by Basel III for bank capital requirements")
    print("\n  Popular libraries: arch, statsmodels, rugarch (R)")


# ================================================================
# Summary Comparison
# ================================================================

def summary_comparison():
    """Print a side-by-side comparison table of all correlation methods."""
    print("\n" + "="*70)
    print("SUMMARY: Which Method Should You Use?")
    print("="*70)

    # One row per method: (name, complexity, use case, industry usage)
    rows = [
        ('Static Pearson', 'Very Low', 'Quick exploration', 'Academic/Teaching'),
        ('Rolling Correlation', 'Low', 'Time-varying relationships', 'Portfolio Management'),
        ('EWMA', 'Low', 'Risk management (short-term)', 'Risk Management (JP Morgan)'),
        ('Spearman', 'Low', 'Non-linear relationships', 'Alternative Data'),
        ('Conditional', 'Medium', 'Regime analysis', 'Hedge Funds'),
        ('Distance/Abs', 'Medium', 'Volatility co-movement', 'Advanced Research'),
        ('DCC-GARCH', 'High', 'Professional risk management', 'Banks/Regulators'),
    ]
    comparison = pd.DataFrame(
        rows,
        columns=['Method', 'Complexity', 'Use Case', 'Industry Usage'],
    )

    print("\n")
    print(comparison.to_string(index=False))

    print("\n" + "="*70)
    print("RECOMMENDATIONS:")
    print("="*70)
    print("  For Students:        Start with Rolling Correlation")
    print("  For Quant Research:  Use Spearman + Conditional")
    print("  For Risk Management: EWMA or DCC-GARCH")
    print("  For Trading:         Rolling + Regime Detection")
    print("="*70)


# ================================================================
# Run All Examples
# ================================================================

if __name__ == "__main__":
    # Run every demo in sequence; each method prints its own report and
    # shows its own matplotlib figure(s), ending with the summary table.
    method1_rolling_correlation()
    method2_ewma_correlation()
    method3_rank_correlation()
    method4_conditional_correlation()
    method5_distance_correlation()
    method6_dcc_garch_concept()
    summary_comparison()
    
    print("\nAll professional correlation methods demonstrated!")
    print("Key Takeaway: Static correlation is just the starting point!")
```

***

### 8. GDP Nowcasting Model

```python
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from finter.data import ContentFactory
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

# =====================================================
# 0) Setup
# =====================================================
# We need a longer history to train a model.
data_start = 20050101
data_end   = 20251026
# --- FIX: Convert integer date to datetime object for pandas functions ---
data_end_dt = pd.to_datetime(str(data_end), format='%Y%m%d')
# --- END FIX ---
cf = ContentFactory(universe_name='us_stock', start=data_start, end=data_end)

# =====================================================
# 1) Bias-Free Helper Function (Point-in-Time Parser)
# =====================================================
def parse_pit_data(df_raw, pit_date, value_name, freq='M'):
    """
    Parse the string-list format from the 'live' data *as it was known*
    at a specific point-in-time (pit_date).

    Parameters
    ----------
    df_raw : pd.DataFrame
        Snapshot history indexed by snapshot date, with a 'value' column
        holding a (possibly stringified) list of [date, value] pairs.
    pit_date : datetime-like
        Point in time; the latest snapshot at or before this date is used.
    value_name : str
        Column name for the parsed values.
    freq : {'Q', 'M', 'W'}, default 'M'
        Frequency used to normalize the observation dates.

    Returns
    -------
    pd.DataFrame
        Values indexed by observation date, or an empty frame when no
        snapshot exists at pit_date (callers only test `.empty`).
    """
    if df_raw.empty or 'value' not in df_raw.columns:
        return pd.DataFrame(columns=['date', value_name])

    # BUG FIX: the original assigned the converted index back onto df_raw,
    # silently mutating the caller's shared snapshot DataFrame. Convert to
    # a local DatetimeIndex instead and leave the input untouched.
    if isinstance(df_raw.index, pd.DatetimeIndex):
        snapshot_dates = df_raw.index
    else:
        try:
            snapshot_dates = pd.to_datetime(df_raw.index)
        except Exception as e:
            print(f"Failed to convert raw index to datetime: {e}")
            return pd.DataFrame(columns=['date', value_name])

    # Find the latest snapshot *before or on* the pit_date
    snapshot_index = snapshot_dates.searchsorted(pit_date, side='right') - 1

    if snapshot_index < 0:
        # No data available at this pit_date
        return pd.DataFrame(columns=['date', value_name])

    # Get the point-in-time snapshot payload
    val_str = df_raw.iloc[snapshot_index]["value"]

    # The 'value' cell may be a stringified Python list or an actual list
    data = ast.literal_eval(val_str) if isinstance(val_str, str) else val_str

    df_parsed = pd.DataFrame(data, columns=["date", value_name])

    # Normalize observation dates to the start of their period
    if freq == 'Q':
        df_parsed["date"] = pd.to_datetime(df_parsed["date"]).dt.to_period('Q').dt.to_timestamp()
    elif freq == 'M':
        df_parsed["date"] = pd.to_datetime(df_parsed["date"], format="%Y-%m")
    elif freq == 'W':
        df_parsed["date"] = pd.to_datetime(df_parsed["date"])

    df_parsed = df_parsed.sort_values("date").reset_index(drop=True)
    df_parsed[value_name] = pd.to_numeric(df_parsed[value_name], errors='coerce')
    return df_parsed.set_index("date")

# =====================================================
# 2) Helper Function to Process PIT Data
# =====================================================
def process_pit_features(gdp_raw, indpro_raw, payems_raw, icsa_raw, pit_date):
    """
    Build the target and feature frames from data *as known on* pit_date.

    Returns (gdp_y, features_q_x), or (None, None) when any input series
    has no snapshot available at that point in time.
    """
    # Point-in-time parse of every raw snapshot series
    gdp_pit = parse_pit_data(gdp_raw, pit_date, "gdp_level", freq='Q')
    indpro_pit = parse_pit_data(indpro_raw, pit_date, "indpro", freq='M')
    payems_pit = parse_pit_data(payems_raw, pit_date, "payems", freq='M')
    icsa_w_pit = parse_pit_data(icsa_raw, pit_date, "icsa", freq='W')

    if any(frame.empty for frame in (gdp_pit, indpro_pit, payems_pit, icsa_w_pit)):
        return None, None  # not enough data at this PIT

    # Target (Y): QoQ annualized GDP growth, in percent
    growth = ((gdp_pit["gdp_level"] / gdp_pit["gdp_level"].shift(1)) ** 4 - 1) * 100
    gdp_y = growth.to_frame(name="gdp_qoq_ann_growth")

    # Feature 1 (X): quarterly average of monthly INDPRO % changes
    indpro_x = (indpro_pit["indpro"].pct_change(1) * 100).resample("QS").mean().to_frame(name="indpro_qoq_avg")

    # Feature 2 (X): quarterly sum of monthly payroll changes
    payems_x = payems_pit["payems"].diff().resample("QS").sum().to_frame(name="payems_qoq_sum")

    # Feature 3 (X): weekly claims -> monthly mean -> quarterly avg % change
    icsa_m = icsa_w_pit['icsa'].resample('M').mean()
    icsa_x = (icsa_m.pct_change(1) * 100).resample("QS").mean().to_frame(name="icsa_qoq_avg")

    # Join all features on the quarterly index
    features_q_x = pd.concat([indpro_x, payems_x, icsa_x], axis=1)

    return gdp_y, features_q_x

# =====================================================
# 3) Load All Raw Snapshot Data (Once)
# =====================================================
print("Loading all raw data snapshots (this may take a moment)...")
# Each 'live-*' dataset holds the full snapshot history (one row per
# release), so we load once here and re-parse per point-in-time later.
df_gdp_raw = cf.get_df("live-gdpc1").dropna(subset=["value"])
df_indpro_raw = cf.get_df("live-indpro").dropna(subset=["value"])
df_payems_raw = cf.get_df("live-payems").dropna(subset=["value"])
df_icsa_raw = cf.get_df("live-icsa").dropna(subset=["value"])
print("Raw data loading complete.")

# =====================================================
# 4) Run Point-in-Time (PIT) Walk-Forward Backtest
# =====================================================
features = ["indpro_qoq_avg", "payems_qoq_sum", "icsa_qoq_avg"]
target = "gdp_qoq_ann_growth"
model = LinearRegression()  # re-fitted on every walk-forward step below

# Define test range. We will predict for these quarters.
# --- FIX: Use the datetime object 'data_end_dt' ---
test_quarters = pd.date_range(start='2014-01-01', end=data_end_dt, freq='QS')
# --- END FIX ---
results_list = []  # one dict per successfully predicted quarter

print(f"Starting bias-free backtest from {test_quarters[0].date()} to {test_quarters[-1].date()}...")

for target_quarter in tqdm(test_quarters):
    # We assume we make the prediction 10 days after the quarter ends,
    # giving time for the 3rd month's data to be released (preliminarily).
    prediction_date = target_quarter + pd.DateOffset(months=3, days=10)
    
    try:
        # 1. Get all data *as it was known* on the prediction_date
        gdp_y_pit, features_q_pit = process_pit_features(
            df_gdp_raw, df_indpro_raw, df_payems_raw, df_icsa_raw, prediction_date
        )
        
        if gdp_y_pit is None:
            # print(f"Skipping {target_quarter}: Not enough data in snapshot.")
            continue
            
        # 2. Create Training Set (data *before* the quarter we are predicting)
        df_train = pd.concat([gdp_y_pit, features_q_pit], axis=1)
        df_train = df_train.loc[df_train.index < target_quarter].dropna()
        
        if len(df_train) < 20: # Need enough data to train
            # print(f"Skipping {target_quarter}: Not enough training data.")
            continue
            
        X_train = df_train[features]
        Y_train = df_train[target]
        
        # 3. Create Test Vector (data *for* the quarter we are predicting)
        X_test = features_q_pit.loc[target_quarter:target_quarter]
        
        if X_test.empty or X_test.isnull().values.any():
            # This means the monthly data for the target_quarter was not yet
            # fully available in the snapshot on 'prediction_date'.
            # print(f"Skipping {target_quarter}: Feature data not yet available.")
            continue
            
        # 4. Train and Predict
        model.fit(X_train, Y_train)
        prediction = model.predict(X_test)[0]
        
        results_list.append({
            "date": target_quarter,
            "Predicted GDP Growth": prediction
        })

    except Exception as e:
        # NOTE(review): deliberate best-effort — a failed quarter is just
        # skipped. Uncomment the print below when debugging data issues.
        # print(f"Error processing {target_quarter}: {e}")
        pass

print("Backtest complete.")

# =====================================================
# 5) Get Final "Ground Truth" Actuals for Comparison
# =====================================================
# For comparison, we use the *latest, most revised* GDP data as "truth".
gdp_y_final, _ = process_pit_features(
    df_gdp_raw, df_indpro_raw, df_payems_raw, df_icsa_raw, pd.Timestamp.now()
)
gdp_y_final = gdp_y_final.rename(columns={"gdp_qoq_ann_growth": "Actual GDP Growth"})

# Combine predictions with actuals
df_pred = pd.DataFrame(results_list).set_index("date")
results = pd.concat([gdp_y_final, df_pred], axis=1)
# Keep only quarters for which a prediction was actually produced
results = results.loc[test_quarters[0]:].dropna(subset=["Predicted GDP Growth"])

# =====================================================
# 6) Visualize Results (Bias-Free)
# =====================================================
plt.figure(figsize=(15, 7))
plt.plot(results.index, results["Actual GDP Growth"], label="Actual GDP Growth (Final Revised)", color="blue", marker='o', linestyle='-')
plt.plot(results.index, results["Predicted GDP Growth"], label="Predicted GDP Growth (Bias-Free)", color="red", marker='x', linestyle='--')

plt.title("GDP Nowcasting Model (Point-in-Time): Actual vs. Predicted", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("QoQ Annualized Growth Rate (%)", fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
# Zero-growth reference line
plt.axhline(0, color="black", linestyle="-", linewidth=0.5)
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.tight_layout()
plt.show()

# =====================================================
# 7) Make a "Real" Nowcast for the Current Quarter
# =====================================================
# Predict 2025-Q3 (using data as of today, Oct 28, 2025)
# NOTE(review): current_quarter is hard-coded; update it when re-running
# in a later quarter.
today = pd.Timestamp.now()
current_quarter = pd.to_datetime("2025-07-01") # Q3

try:
    # 1. Get *today's* data snapshot
    gdp_y_today, features_q_today = process_pit_features(
        df_gdp_raw, df_indpro_raw, df_payems_raw, df_icsa_raw, today
    )
    
    # 2. Create training set (all data *before* current quarter)
    df_train_today = pd.concat([gdp_y_today, features_q_today], axis=1)
    df_train_today = df_train_today.loc[df_train_today.index < current_quarter].dropna()
    
    # 3. Get features for the current quarter
    X_predict_now = features_q_today.loc[current_quarter:current_quarter]
    
    if X_predict_now.isnull().values.any():
        print(f"\n--- Cannot Nowcast {current_quarter.to_period('Q')}: ---")
        print("Monthly data for the quarter is not yet fully available.")
        print(X_predict_now)
    else:
        # 4. Train and Predict
        model.fit(df_train_today[features], df_train_today[target])
        nowcast = model.predict(X_predict_now)[0]
        
        print(f"\n--- Live Nowcast Prediction (as of {today.date()}) ---")
        print(f"Predicted GDP Growth for {current_quarter.to_period('Q')}: {nowcast:.2f}%")
        print("Based on features:")
        print(X_predict_now)

except Exception as e:
    print(f"\nCould not generate live nowcast: {e}")


```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-5a533717037296881fdfce0eeedce1883e030fc2%2Fimage%20(33).png?alt=media" alt=""><figcaption></figcaption></figure>

***

### 9. Quantitative Comparative Analysis: US vs. Korean Indices

```python
############################################################################
# This script performs a quantitative analysis of four major stock indices:
# KOSPI, KOSDAQ, S&P 500, and NASDAQ.
#
# The analysis includes:
# 1. Normalized Performance: Visual comparison of long-term growth.
# 2. Time-Lag Adjusted Correlation: Measures correlation between
#    US markets (T-1) and Korean markets (T).
# 3. Rolling Volatility: Tracks the 60-day annualized volatility.
# 4. Risk-Return Profile: Compares annualized return vs. volatility.
# 5. Cointegration Test: Checks for a long-term equilibrium relationship
#    between KOSPI(T) and S&P 500(T-1).
# 6. Granger Causality Test: Tests if US markets predict Korean markets.
# 7. Rolling Beta: Calculates KOSPI's 1-year rolling sensitivity
#    to the S&P 500 (T-1).
#
############################################################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from finter.data import ContentFactory
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, grangercausalitytests
import warnings

# Suppress unnecessary warnings
warnings.filterwarnings("ignore")

# Set graph style
plt.style.use('ggplot')
# Font settings for CJK (commented out, adjust as needed)
# plt.rcParams['font.family'] = 'AppleGothic' # Mac
# plt.rcParams['font.family'] = 'Malgun Gothic' # Windows
plt.rcParams['axes.unicode_minus'] = False # Fix for minus sign display

############################################################
# A. Load Data (Select 4 Indices)
############################################################
print("Loading 4 major index data from pre-loaded 'bm' DataFrame...")

# (User's pre-executed code)
# start, end = 20000105, 20251028
# cf_raw = ContentFactory('raw', start, end)
# bm = cf_raw.get_df("content.factset.api.price_volume.world-index-price_close.1d")

# --- (Start from here) ---
# 1. Select only the 4 indices needed for analysis
tickers = ["KOSPI", "KOSDAQ", "S&P500", "NASDAQ_COMPOSITE"]
all_indices_df = bm[tickers]

# 2. Rename columns for simplicity
all_indices_df.columns = ["KOSPI", "KOSDAQ", "S&P 500", "NASDAQ"]
all_indices = all_indices_df.astype(float)

# 3. Handle NaN values due to holidays (fill forward)
# BUG FIX: DataFrame.ffill is NOT in-place; the original discarded its
# result, so holiday gaps were never actually filled. Assign it back.
all_indices = all_indices.ffill(limit_area='inside', limit=3)
# Drop any remaining NaNs at the beginning
all_indices.dropna(inplace=True)

print("Data selection and preprocessing complete.")
print(all_indices.tail())


############################################################
# B. Analysis 1: Normalized Performance
############################################################
# (Same as before - compares individual index performance)
# Rebase every index to 100 at the first row so growth is comparable.
normalized_indices = (all_indices / all_indices.iloc[0] * 100)
plt.figure(figsize=(14, 7))
normalized_indices.plot(ax=plt.gca(), linewidth=1.5)
plt.title("Normalized Performance (Start=100)")
plt.ylabel("Normalized Price")
plt.legend(loc='upper left')
plt.show()


############################################################
# C. Analysis 2: Returns Correlation (Time-Lag Adjusted)
############################################################
print("\n[C. Correlation Analysis (Time-Lag Adjusted)]")

# Convert Price -> Log Returns for analysis
log_returns = np.log(all_indices / all_indices.shift(1)).dropna()

# --- Time-Lag Adjustment ---
# Create a time-lag adjusted DataFrame for comparison.
# US indices are shifted by one day so that Korean day T is compared
# against the most recent completed US session (T-1).
log_returns_aligned = pd.DataFrame({
    'KOSPI (T)': log_returns['KOSPI'],
    'KOSDAQ (T)': log_returns['KOSDAQ'],
    'S&P 500 (T-1)': log_returns['S&P 500'].shift(1),
    'NASDAQ (T-1)': log_returns['NASDAQ'].shift(1)
}).dropna()

corr_matrix_aligned = log_returns_aligned.corr()

print("Time-Lag Adjusted Correlation Matrix:")
print(corr_matrix_aligned)

# Visualize correlation heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix_aligned, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix of Daily Log Returns (US T-1 vs KR T)")
plt.show()
print("  -> Insight: The correlation between S&P 500(T-1) and KOSPI(T) will likely be higher than the non-lagged version.")


############################################################
# D. Analysis 3: Rolling Volatility
############################################################
# (Same as before - compares individual index volatility)
print("\n[D. Rolling Volatility Analysis]")
# 60-day std of daily log returns, annualized with sqrt(252 trading days)
rolling_volatility = log_returns.rolling(window=60).std() * np.sqrt(252)
plt.figure(figsize=(14, 7))
rolling_volatility.plot(ax=plt.gca(), linewidth=1.5)
plt.title("Rolling Volatility (60-Day Annualized)")
plt.ylabel("Annualized Volatility")
# (Visualization code same as before)
plt.axvline(pd.to_datetime('2008-09-15'), color='red', linestyle='--', label='Lehman Crisis')
plt.axvline(pd.to_datetime('2020-03-11'), color='blue', linestyle='--', label='COVID-19 Pandemic')
plt.legend(loc='upper left')
plt.show()

############################################################
# E. Analysis 4: Risk-Return Profile
############################################################
# (Same as before - compares individual index characteristics)
print("\n[E. Risk-Return Profile]")
# Annualize mean/std of daily log returns with 252 trading days
ann_returns = log_returns.mean() * 252
ann_volatility = log_returns.std() * np.sqrt(252)
# NOTE(review): this Sharpe ratio assumes a zero risk-free rate
sharpe_ratio = ann_returns / ann_volatility

summary_df = pd.DataFrame({
    'Annualized Return': ann_returns,
    'Annualized Volatility': ann_volatility,
    'Sharpe Ratio': sharpe_ratio
})
print(summary_df.sort_values(by='Sharpe Ratio', ascending=False))


############################################################
# F. Advanced Analysis 1: Cointegration Test (Time-Lag Adjusted)
############################################################
print("\n[F. Cointegration Test (KOSPI vs S&P 500, Time-Lag Adjusted)]")

# --- Time-Lag Adjustment ---
# Cointegration test uses the original price series (levels, not returns)
prices_aligned = pd.DataFrame({
    'KOSPI_T': all_indices['KOSPI'],
    'SP500_T-1': all_indices['S&P 500'].shift(1)
}).dropna()

# statsmodels coint: H0 = no cointegration between the two series
score, pvalue, _ = coint(prices_aligned['KOSPI_T'], prices_aligned['SP500_T-1'])
print(f"  Test Score: {score:.4f}")
print(f"  P-value: {pvalue:.4f}")

if pvalue < 0.05:
    print("  Result (p < 0.05): Reject H0. The series are cointegrated.")
    print("  -> Insight: KOSPI(T) and S&P 500(T-1) share a long-term equilibrium relationship.")
else:
    print("  Result (p >= 0.05): Fail to reject H0. The series are NOT cointegrated.")
    print("  -> Insight: The two series do not share a long-term equilibrium relationship.")

    
############################################################
# G. Advanced Analysis 2: Granger Causality Test
############################################################
print("\n[G. Granger Causality Test (Lags 1-5)]")
# (This section intentionally uses the original data
#  as the test itself evaluates lags.)
max_lag = 5

# Test 1: Does S&P 500 Granger-cause KOSPI?
# grangercausalitytests checks whether the SECOND column helps predict
# the first, so column order is [effect, cause].
print("  Test 1: Does S&P 500 Granger-cause KOSPI?")
# log_returns is already calculated
data_g1 = log_returns[['KOSPI', 'S&P 500']].dropna()
results_g1 = grangercausalitytests(data_g1, max_lag, verbose=False)

# ssr_ftest tuple is (F-stat, p-value, df_denom, df_num); [1] = p-value
for lag in range(1, max_lag + 1):
    pvalue = results_g1[lag][0]['ssr_ftest'][1]
    print(f"    Lag {lag}: P-value = {pvalue:.4f} {'(Significant)' if pvalue < 0.05 else ''}")
print("  -> Insight: If the P-value at Lag 1 is < 0.05, it confirms that S&P 500(T-1)")
print("               significantly predicts KOSPI(T), as proven by the data.")

# Test 2: Does NASDAQ Granger-cause KOSDAQ?
print("\n  Test 2: Does NASDAQ Granger-cause KOSDAQ?")
data_g2 = log_returns[['KOSDAQ', 'NASDAQ']].dropna()
results_g2 = grangercausalitytests(data_g2, max_lag, verbose=False)

for lag in range(1, max_lag + 1):
    pvalue = results_g2[lag][0]['ssr_ftest'][1]
    print(f"    Lag {lag}: P-value = {pvalue:.4f} {'(Significant)' if pvalue < 0.05 else ''}")


############################################################
# H. Advanced Analysis 3: Rolling Beta (Time-Lag Adjusted)
############################################################
print("\n[H. Rolling Beta Analysis (KOSPI vs S&P 500, Time-Lag Adjusted)]")
# Use % returns for beta calculation
pct_returns = all_indices.pct_change()

# --- Time-Lag Adjustment ---
# KOSPI (T) vs S&P 500 (T-1)
pct_returns_aligned = pd.DataFrame({
    'KOSPI_T': pct_returns['KOSPI'],
    'SP500_T-1': pct_returns['S&P 500'].shift(1)
}).dropna()

# 1-year (252 trading days) rolling window
window = 252 
betas = []

Y = pct_returns_aligned['KOSPI_T']
X = pct_returns_aligned['SP500_T-1']

print("  Calculating rolling beta... (This may take a moment)")
# Each window covers rows [i-window, i), so the beta stored for date i
# uses only data strictly before that date (no look-ahead).
for i in range(window, len(pct_returns_aligned)):
    Y_window = Y.iloc[i-window:i]
    X_window = X.iloc[i-window:i]
    
    # OLS with intercept: params are [const, slope]
    X_window_const = sm.add_constant(X_window)
    
    model = sm.OLS(Y_window, X_window_const).fit()
    betas.append(model.params.iloc[1]) # [1] is the coefficient for SP500_T-1 (beta)

# Convert to Pandas Series for visualization
rolling_beta = pd.Series(betas, index=pct_returns_aligned.index[window:])

# Visualize rolling beta
plt.figure(figsize=(14, 7))
rolling_beta.plot(linewidth=1.5, label='KOSPI 1Y Rolling Beta (vs S&P 500 T-1)')
plt.title('KOSPI 1-Year Rolling Beta (vs S&P 500 T-1)')
plt.ylabel('Beta')
plt.xlabel('Date')
plt.axhline(rolling_beta.mean(), color='red', linestyle='--', 
            label=f'Average Beta: {rolling_beta.mean():.2f}')
plt.axhline(1.0, color='black', linestyle=':', label='Beta = 1.0 (Market)')
plt.legend()
plt.show()





```

***

### 10. FINTER Backtest 1: High Dividend & EPS Momentum Strategy

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from finter import BaseAlpha
from finter.data import ContentFactory
from finter.backtest import Simulator
from finter.performance import PortfolioAnalyzer  # Import performance analysis module

# ----------------------------------------------------
# 1. Helper Function and Constants
# ----------------------------------------------------
def calculate_previous_start_date(start_date, lookback_days):
    """Return the YYYYMMDD integer date `lookback_days` days before `start_date`."""
    anchor = datetime.strptime(str(start_date), "%Y%m%d")
    shifted = anchor - timedelta(days=lookback_days)
    return int(shifted.strftime("%Y%m%d"))

# Approximate trading-day counts for common lookback horizons
LOOKBACK_DAYS_6M = 126
LOOKBACK_DAYS_3M = 63
LOOKBACK_DAYS_1M = 21

# ----------------------------------------------------
# 2. Alpha Class Definition (Strategy Logic)
# ----------------------------------------------------
class Alpha(BaseAlpha):
    """
    Strategy:
    1. Top 10% Dividend Yield (div_rate > 0.9)
    2. EPS (Earnings Per Share) > 0
    3. EPS 3-Month Momentum > 0
    4. Equal-weighted investment in stocks meeting the above conditions
       NOTE(review): the code below actually weights survivors by their
       dividend-yield rank (div_rate), not strictly equal-weight — confirm intent.
    """
    def get(self, start, end):
        """Return the daily position matrix (dates x tickers) for [start, end].

        start/end are yyyymmdd ints. Values are notional positions scaled to
        a 1e8 base and shifted one day to avoid look-ahead bias.
        """
        print(f'Alpha.get() called: start={start}, end={end}')
        
        # Calculate pre-start date for data loading
        pre_start = calculate_previous_start_date(start, 365) # Load 1 year of data for factor calculation
        
        # Load data via finter ContentFactory
        cf = ContentFactory('us_stock', pre_start, end)
        self.close = cf.get_df('price_close')
        self.div = cf.get_df('indicated_annual_dividend')
        self.eps = cf.get_df('eps')

        # Calculate factors
        # 1. 3M EPS Momentum (fill_method=None keeps gaps as NaN rather than padding)
        self.eps_change = self.eps.pct_change(LOOKBACK_DAYS_3M, fill_method=None)
        
        # 2. Dividend Yield Rank (cross-sectional, pct=True)
        self.div_rate = self.div.rank(pct=True, axis=1)

        # 3. Buy Conditions (Screening)
        # Boolean screens multiply to zero out any name failing a condition;
        # survivors keep their dividend-rank value as the raw weight.
        self.selected_stocks = self.div_rate * (self.div_rate > 0.9) * (self.eps_change > 0) * (self.eps > 0)

        # 4. Calculate Positions (normalized so each row sums to 1; cells with
        #    missing close prices are masked to NaN before normalization)
        self.selected_stocks = self.selected_stocks[~self.close.isna()]
        self.stock_ratio = self.selected_stocks.div(self.selected_stocks.sum(axis=1), axis=0).fillna(0)
        self.position = self.stock_ratio * 1e8 # Base position size for the finter simulator

        # 5. Shift(1) to prevent look-ahead bias
        alpha = self.position.shift(1).fillna(0)
        
        # Return the final position sliced for the start ~ end period
        alpha = alpha.loc[str(start):str(end)]
        return alpha

# ----------------------------------------------------
# 3. Run Backtest
# ----------------------------------------------------

# Set period (yyyymmdd ints; Alpha.get() pads its own lookback internally)
start, end = 20000101, 20240101

# Load Benchmark (NASDAQ) data
print("Loading benchmark data...")
cf_raw = ContentFactory('raw', start, end)
bm_df = cf_raw.get_df("content.factset.api.price_volume.world-index-price_close.1d")

# Execute strategy (generate positions)
print("Executing alpha strategy...")
alpha_strategy = Alpha()
position = alpha_strategy.get(start, end)

# Run simulator (same position, two reporting currencies)
print("Running simulation (USD, KRW)...")
simulator = Simulator("us_stock")
results_usd = simulator.run(position, start=start, end=end, debug=False)
results_krw = simulator.run(position, currency="KRW", start=start, end=end, debug=False) # KRW base

# ----------------------------------------------------
# 4. Visualize Results
# ----------------------------------------------------
print("Visualizing results...")
plt.figure(figsize=(14, 7))

# 1. Strategy NAV (USD, KRW)
results_usd.summary.nav.plot(label='Strategy NAV (USD)')
results_krw.summary.nav_in_currency.plot(label='Strategy NAV (KRW)', linestyle=':')

# 2. Benchmark NAV
# Align benchmark start date with backtest start date
bm = bm_df['NASDAQ_COMPOSITE'].ffill().loc[results_usd.summary.index[0]:]
# Normalize benchmark to the backtest's starting NAV (1000)
bm_normalized = bm / bm.iloc[0] * 1000 
bm_normalized.plot(label='NASDAQ Composite (BM)', linestyle='--', color='black')

plt.title(f'Alpha Strategy vs NASDAQ ({start} - {end})')
plt.legend()
plt.grid(True)
plt.ylabel('Normalized Value (Start=1000)')
plt.show()

# ----------------------------------------------------
# 5. Performance Analysis (PortfolioAnalyzer)
# ----------------------------------------------------
print("\n" + "="*50)
print("     Strategy Performance Analysis (USD Base)")
print("="*50)
# Providing the benchmark (bm) to nav2stats also calculates relative metrics like alpha and beta.
# NOTE(review): .loc[results_usd.summary.index] assumes the benchmark covers every
# strategy date — missing dates would raise/introduce NaN. TODO confirm coverage.
stats_usd = PortfolioAnalyzer.nav2stats(results_usd.summary["nav"], period=None, benchmark=bm_normalized.loc[results_usd.summary.index])
print(stats_usd)

print("\n" + "="*50)
print("     Strategy Performance Analysis (KRW Base)")
print("="*50)
# It's difficult to compare KRW NAV directly with a USD benchmark, so we'll look at absolute performance.
stats_krw = PortfolioAnalyzer.nav2stats(results_krw.summary["nav_in_currency"], period=None)
print(stats_krw)
```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-ac4df61dffee5b46ecbe4e263d41916bb3550909%2Fimage%20(18).png?alt=media" alt=""><figcaption></figcaption></figure>

***

### 11. FINTER Backtest 2 : Asset Turnover Factor (Long-Short)

```python
#################################################################
# Finter Backtest: Asset Turnover Factor (Long-Short)
#################################################################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from finter import BaseAlpha
from finter.data import ContentFactory
from finter.backtest.v0 import Simulator
from finter.performance import PortfolioAnalyzer # Import performance analysis module

# ----------------------------------------------------
# 1. Setup
# ----------------------------------------------------
# Backtest window (yyyymmdd ints). The Alpha class loads its own padded window.
start, end = 20150101, 20250530
cf = ContentFactory('us_stock', start, end)
cf.show()  # list the data items available for this universe (informational only)

# ----------------------------------------------------
# 2. Helper Function
# ----------------------------------------------------

# Function to calculate the previous start date based on a given lookback period
# Function to calculate the previous start date based on a given lookback period
def calculate_previous_start_date(start_date, lookback_days):
    """Shift a yyyymmdd date backward by `lookback_days` calendar days.

    Accepts an int or string date and returns an int in yyyymmdd form.
    """
    parsed = datetime.strptime(str(start_date), "%Y%m%d")
    return int((parsed - timedelta(days=lookback_days)).strftime("%Y%m%d"))

# ----------------------------------------------------
# 3. Alpha Class Definition
# ----------------------------------------------------

# Alpha class inheriting from BaseAlpha
# Alpha class inheriting from BaseAlpha
class Alpha(BaseAlpha):
    """Asset-turnover long/short factor.

    Ranks stocks cross-sectionally by quarterly asset turnover
    (sales / average assets); the top third forms the long book and the
    bottom third the short book, each equal-weighted per day.
    """

    # Function to safely apply a transformation to the data
    # This seems designed to handle point-in-time fundamental data stored as dictionaries
    def safe_apply(self, x):
        """Extract the latest value from a point-in-time dict cell.

        Assumes a non-NaN cell is a dict whose keys are date-like strings
        convertible to float — TODO confirm against the raw atq/saleq format.
        """
        if pd.isna(x):
            return x  # Return NaN values as they are
        else:
            # Only compare keys that can be converted to float
            valid_items = {float(k): v for k, v in x.items() if self.is_float(k)}
            if valid_items:
                max_key = max(valid_items.keys())  # Find the maximum key (latest date)
                return valid_items[max_key]  # Return the value corresponding to the maximum key
            else:
                return None  # Return None if no keys can be converted to float

    # Function to check if a value can be converted to float
    # NOTE(review): catches ValueError only; a None/list key would raise
    # TypeError — assumed keys are always str or numeric. Verify upstream.
    def is_float(self, value):
        try:
            float(value)
            return True
        except ValueError:
            return False

    # Function to generate a long/short position DataFrame
    def get_long_short_df(self, factor, close):
        """Turn a factor frame into (long, short) notional position frames.

        Columns are remapped from factor tickers to price tickers by matching
        the first 6 characters (presumably the gvkey prefix — confirm).
        """
        # Function to scale the DataFrame (equal weighting)
        def scale_df(df):
            row_sums = df.sum(axis=1)  # Calculate the sum of each row
            scaling_factors = np.where(row_sums != 0, row_sums, 1)  # Set scaling factors, 1 where row sum is 0
            return df.div(scaling_factors, axis=0)  # Scale the DataFrame

        rank = factor.rank(axis=1, pct=True) # Calculate cross-sectional factor rank
        long = rank >= 0.66  # Select stocks in the top 33% rank
        short = rank <= 0.33 # Select stocks in the bottom 33% rank
        
        # This complex logic matches tickers between factor and price data
        # It assumes tickers have different formats (e.g., 'ABC-US' vs 'ABC-US-EQ')
        close_columns_short = [col[:6] for col in close.columns]  # Extract the first 6 characters of close column names

        long_columns = {col[:6]: col for col in long.columns}  # Map long column names
        short_columns = {col[:6]: col for col in short.columns} # Map short column names
        new_long_df = pd.DataFrame(index=close.index, columns=close.columns)  # Create a new long DataFrame
        new_short_df = pd.DataFrame(index=close.index, columns=close.columns) # Create a new short DataFrame

        for short_col, close_col in zip(close_columns_short, close.columns):
            matching_long_col = long_columns.get(short_col)  # Find matching long column
            matching_short_col = short_columns.get(short_col) # Find matching short column
            if matching_long_col:
                new_long_df[close_col] = long[matching_long_col]  # Copy values from the matching column
            if matching_short_col:
                new_short_df[close_col] = short[matching_short_col] # Copy values from the matching columns

        # Forward fill the boolean signal and apply price filter
        # (masking by close > 0 turns untradable cells into NaN, which the
        #  ffill then carries forward for at most 60 rows)
        long_df = new_long_df[close > 0].ffill(limit=60)
        short_df = new_short_df[close > 0].ffill(limit=60)

        long_df = scale_df(long_df)  # Scale the long DataFrame
        short_df = scale_df(short_df) # Scale the short DataFrame
        return long_df * 1e8, short_df * 1e8  # Adjust scale and return

    # Method to generate alpha
    def get(self, start, end):
        """Return the long position frame for [start, end]; the matching short
        frame is stored on self.short as a side effect."""
        pre_start = calculate_previous_start_date(start, 730)  # Calculate start date for data retrieval (2 years)
        cf = ContentFactory('us_stock', pre_start, end)  # Create an instance of ContentFactory
        
        # Retrieve and process asset data
        # NOTE: Changed ffill(limit=4) to ffill() to correctly persist quarterly data until the next update.
        self.atq = cf.get_df('atq').applymap(lambda x : self.safe_apply(x)).ffill()
        # Retrieve and process sales data
        self.saleq = cf.get_df('saleq').applymap(lambda x : self.safe_apply(x)).ffill()
        
        self.close = cf.get_df('price_close')  # Retrieve closing price data

        # Calculate asset turnover
        # NOTE: Changed shift(12) (12 days) to shift(63) (approx. 1 quarter) for a more meaningful factor.
        avg_assets = (self.atq + self.atq.shift(63)) / 2
        asset_turnover = self.saleq / avg_assets
        
        # Persist the calculated factor for one quarter (63 trading days) to stabilize ranks
        asset_turnover = asset_turnover.ffill(limit=63)

        long_position, short_position = self.get_long_short_df(asset_turnover, self.close)  # Calculate long/short positions

        long_position = long_position.loc[:,long_position.sum() != 0] # for memory saving
        short_position = short_position.loc[:,short_position.sum() != 0] # for memory saving

        # Shift after fillna leaves the first row NaN; the caller's later
        # fillna(0) (or the simulator) is assumed to absorb it — confirm.
        alpha = long_position.fillna(0).shift(1)  # Shift positions to avoid look-ahead bias
        self.short = short_position.fillna(0).shift(1).loc[str(start): str(end)]
        return alpha.loc[str(start): str(end)]  # Return alpha values for the given period

# ----------------------------------------------------
# 4. Strategy Instantiation
# ----------------------------------------------------
alpha = Alpha()
position = alpha.get(start, end) # This is the Long-Only position
short_position = alpha.short # This is the Short-Only position

# ----------------------------------------------------
# 5. Backtest Execution
# ----------------------------------------------------

cash = 1000 # Initial amount of investment
position = position.iloc[:-2] # Remove last two days as in original code
short_position = short_position.iloc[:-2]

# Find the first valid trading day
first_idx_long = position.replace(0, np.nan).dropna(how='all').index[0]
first_idx_short = short_position.replace(0, np.nan).dropna(how='all').index[0]
first_idx = max(first_idx_long, first_idx_short) # Start when both signals are valid

position = position.fillna(0)
short_position = short_position.fillna(0)

# Create the Long-Short (market neutral) position
# (columns differ between the two frames; pandas aligns and fills the
#  non-overlapping columns with NaN — presumably handled downstream, confirm)
long_short_position = position - short_position

sim = Simulator(market_type="us_stock")
start_str = first_idx.strftime("%Y%m%d")
end_str = position.index[-1].strftime("%Y%m%d")

print(f"Running backtests from {start_str} to {end_str}...")

# Run Long-Only backtest
# (fee/slippage values are presumably in basis points — confirm with Simulator docs)
long_bt = sim.run(position,
                  start=start_str, end=end_str,
                  buy_fee_tax=25, sell_fee_tax=25, slippage=10,
                  initial_cash=cash, drip="reinvest",
                  rebalancing_method="by_position", core_type="basic", debug=False)

# Run Short-Only backtest
short_bt = sim.run(short_position,
                   start=start_str, end=end_str,
                   buy_fee_tax=25, sell_fee_tax=25, slippage=10,
                   initial_cash=cash, drip="reinvest",
                   rebalancing_method="by_position", core_type="basic", debug=False)

# Run Long-Short backtest
long_short_bt = sim.run(long_short_position,
                        start=start_str, end=end_str,
                        buy_fee_tax=25, sell_fee_tax=25, slippage=10,
                        initial_cash=cash, drip="reinvest",
                        rebalancing_method="by_position", core_type="basic", debug=False)

print("Backtests completed.")

# ----------------------------------------------------
# 6. Visualization (Corrected with Subplots)
# ----------------------------------------------------

# Get benchmark data
cf_raw = ContentFactory('raw', start_str, end_str)
bm = cf_raw.get_df("content.factset.api.price_volume.world-index-price_close.1d") # Get benchmark data
bm_nav = bm['NASDAQ_COMPOSITE'].ffill().loc[long_bt.summary.index[0]:]
bm_nav = bm_nav / bm_nav.iloc[0] * cash # Normalize to initial cash

# --- Create 2 subplots (2 rows, 1 column) ---
# sharex=True means they share the same X-axis (date)
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 12), sharex=True)

# --- Plot 1: Long-Only, Short-Only vs. Benchmark ---
ax1.set_title('Long-Only vs. Short-Only Portfolios')
long_bt.summary.nav.plot(ax=ax1, label='Long Portfolio')
short_bt.summary.nav.plot(ax=ax1, label='Short Portfolio')
bm_nav.plot(ax=ax1, label='NASDAQ (Benchmark)', color='black', linestyle=':')
ax1.set_ylabel(f'NAV (Start = ${cash})')
ax1.legend()
ax1.grid(True)

# --- Plot 2: Long-Short Spread (Market Neutral) ---
ax2.set_title('Long-Short Spread (Market Neutral)')
long_short_bt.summary.nav.plot(ax=ax2, label='Long-Short Spread', color='green')
ax2.set_ylabel(f'NAV (Start = ${cash})')
ax2.legend()
ax2.grid(True)

plt.xlabel('Date')
plt.tight_layout() # Adjust plots to prevent overlap
plt.show()


# ----------------------------------------------------
# 7. Performance Analysis (FIXED)
# ----------------------------------------------------
print("\n" + "="*50)
print("     Performance Analysis: Long-Only Portfolio")
print("="*50)
stats_long = PortfolioAnalyzer.nav2stats(
    long_bt.summary["nav"], 
    period=None
)
print(stats_long)

print("\n" + "="*50)
print("     Performance Analysis: Short-Only Portfolio")
print("="*50)
stats_short = PortfolioAnalyzer.nav2stats(
    short_bt.summary["nav"], 
    period=None
)
print(stats_short)

print("\n" + "="*50)
print("     Performance Analysis: Long-Short Portfolio")
print("="*50)

stats_long_short = PortfolioAnalyzer.nav2stats(
    long_short_bt.summary["nav"], 
    period=None
)
print(stats_long_short)
```

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-98e070795779df54187f6ffc4abba0b1e89e1888%2Fimage%20(21).png?alt=media" alt=""><figcaption></figcaption></figure>

***

### 12. FINTER Backtest 3 : RSI Long Only Strategy

```python
#################################################################
# Finter Backtest: RSI Momentum Factor (Long-Only vs Benchmark)
#################################################################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from finter import BaseAlpha
from finter.data import ContentFactory
from finter.backtest.v0 import Simulator
from finter.performance import PortfolioAnalyzer


# ----------------------------------------------------
# 1. Setup
# ----------------------------------------------------
# Backtest window (yyyymmdd ints). Alpha.get() loads its own padded window.
start, end = 20000101, 20251026
cf = ContentFactory('us_stock', start, end)
cf.show()  # optional: list the data items available for this universe


# ----------------------------------------------------
# 2. Helper Function
# ----------------------------------------------------
def calculate_previous_start_date(start_date, lookback_days):
    """Return `start_date` (yyyymmdd int/str) moved `lookback_days` days earlier, as an int."""
    earlier = datetime.strptime(str(start_date), "%Y%m%d") - timedelta(days=lookback_days)
    return int(earlier.strftime("%Y%m%d"))


# ----------------------------------------------------
# 3. Alpha Class Definition (RSI Factor)
# ----------------------------------------------------
class Alpha(BaseAlpha):
    """Long-only RSI momentum factor: hold the top-third smoothed-RSI names, equal-weighted."""

    universe = "us_stock"
    momentum_period = 21
    smooth_period = 21

    def compute_rsi(self, price: pd.DataFrame) -> pd.DataFrame:
        """Return a `smooth_period`-day moving average of the `momentum_period`-day RSI."""
        diffs = price.diff()
        up_moves = diffs.where(diffs > 0, 0)
        down_moves = -diffs.where(diffs < 0, 0)

        mean_up = up_moves.rolling(self.momentum_period, min_periods=1).mean()
        mean_down = down_moves.rolling(self.momentum_period, min_periods=1).mean()

        # Replace a zero average loss with NaN so an all-gain window yields
        # NaN RSI instead of a division-by-zero infinity.
        strength = mean_up / mean_down.replace(0, np.nan)
        raw_rsi = 100 - (100 / (1 + strength))
        return raw_rsi.rolling(self.smooth_period, min_periods=1).mean()

    def get(self, start, end):
        """Build the daily long-only notional position matrix for [start, end]."""
        warmup_start = calculate_previous_start_date(start, 365)
        factory = ContentFactory('us_stock', warmup_start, end)
        close = factory.get_df("price_close").dropna(how="all", axis=1).sort_index()

        smoothed = self.compute_rsi(close)

        # Cross-sectional percentile rank; go long the top ~33% of names.
        pct_rank = smoothed.rank(axis=1, pct=True)
        in_top_third = pct_rank >= 0.66
        has_price = close > 0
        # NOTE(review): the AND of two boolean frames has no NaNs, so this
        # ffill is likely a no-op — confirm the intended signal persistence.
        holdings = (in_top_third & has_price).ffill(limit=60)

        # Equal-weight the held names; rows with no holdings divide by 1
        # (not 0) so they stay all-zero instead of becoming NaN.
        weights = holdings.astype(float)
        totals = weights.sum(axis=1)
        scaled = weights.div(np.where(totals != 0, totals, 1), axis=0)

        notional = scaled * 1e8
        notional = notional.shift(1).fillna(0)  # prevent look-ahead bias
        return notional.loc[str(start): str(end)]


# ----------------------------------------------------
# 4. Strategy Instantiation & Backtest
# ----------------------------------------------------
alpha = Alpha()
position = alpha.get(start, end).fillna(0)


# Trim the window to start at the first day with any nonzero position.
first_valid = position.replace(0, np.nan).dropna(how='all').index[0]
start_str = first_valid.strftime("%Y%m%d")
end_str = position.index[-1].strftime("%Y%m%d")

print(f"Running Long-Only backtest from {start_str} to {end_str}...")

sim = Simulator(market_type="us_stock")
# Fee/slippage values are presumably in basis points — confirm with Simulator docs.
bt = sim.run(
    position,
    start=start_str,
    end=end_str,
    buy_fee_tax=25,
    sell_fee_tax=25,
    slippage=10,
    initial_cash=1000,
    drip="reinvest",
    rebalancing_method="by_position",
    core_type="basic",
    debug=False,
)
print("Backtest completed.")


# ----------------------------------------------------
# 5. Benchmark Comparison (NASDAQ Composite)
# ----------------------------------------------------
try:
    cf_bench = ContentFactory('raw', start_str, end_str)
    bm = cf_bench.get_df("content.factset.api.price_volume.world-index-price_close.1d")

    print("Available benchmark columns:", bm.columns.tolist())
    # Take the first column whose name mentions NASDAQ.
    bm_col = [col for col in bm.columns if "NASDAQ" in col.upper()][0]

    # Align to the backtest start and rebase to the initial cash of 1000.
    bm_nav = bm[bm_col].ffill().loc[bt.summary.index[0]:]
    bm_nav = bm_nav / bm_nav.iloc[0] * 1000 
except Exception as e:
    # Best-effort: the benchmark is optional, so plotting/stats degrade gracefully.
    print(" Benchmark load failed:", e)
    bm_nav = None


# ----------------------------------------------------
# 6. Visualization: Portfolio NAV vs Benchmark
# ----------------------------------------------------
plt.figure(figsize=(14, 7))
plt.title("RSI Momentum Long-Only Strategy vs NASDAQ Benchmark")

bt.summary.nav.plot(label="RSI Long Portfolio", linewidth=2)
if bm_nav is not None:
    bm_nav.plot(label="NASDAQ Benchmark", linestyle="--", color="black")

plt.ylabel("Net Asset Value ($)")
plt.xlabel("Date")
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()


# ----------------------------------------------------
# 7. Performance Analysis
# ----------------------------------------------------
print("\n" + "="*60)
print("     Performance Analysis: RSI Long-Only Portfolio")
print("="*60)
stats = PortfolioAnalyzer.nav2stats(bt.summary["nav"], period=None)
print(stats)

if bm_nav is not None:
    print("\n" + "="*60)
    print("     Benchmark Comparison: NASDAQ Composite")
    print("="*60)
    bm_stats = PortfolioAnalyzer.nav2stats(bm_nav, period=None)
    print(bm_stats)


```

***

### 13. FINTER Backtest 4 : Double-Momentum Strategy

```python
#################################################################
# Backtest Runner: Quarterly Rebalance Double-Momentum / DD Strategy
# 
# Strategy Logic Summary
# ----------------------
# This strategy combines two factors:
#   (1) Momentum: Difference between 12-month and 3-month returns (12M - 3M).
#       → Picks stocks with strong long-term momentum but not overextended recently.
#   (2) Drawdown (DD): Measures how far the stock price is below its 12-month high.
#       → Selects stocks that have recently pulled back (potentially undervalued within trend).
#
# Core Rules
# -----------
# - Universe: S&P500 constituents (active stocks only)
# - Rebalancing: Quarterly at the *beginning* of each new quarter
# - Entry: Equal-weight among selected stocks (top 20% momentum + specific DD quantile)
# - Exit: Positions are maintained until the next rebalance date
# - Trade Timing: Buy next day after signal generation
#
# Components
# -----------
# - Alpha Class: Generates position weights based on momentum & drawdown filters
# - Simulator: Executes the backtest using Finter’s standard backtesting framework
# - PortfolioAnalyzer: Calculates performance metrics
# - Visualization: NAV vs Benchmark (e.g., NASDAQ)
#################################################################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from finter import BaseAlpha
from finter.data import ContentFactory
from finter.backtest.v0 import Simulator
from finter.performance import PortfolioAnalyzer


############################################################
# 0. Helper
############################################################
def calculate_previous_start_date(start_date, lookback_days):
    """Move a yyyymmdd date back by `lookback_days` calendar days; return an int."""
    as_dt = datetime.strptime(str(start_date), "%Y%m%d")
    rewound = as_dt - timedelta(days=lookback_days)
    return int(rewound.strftime("%Y%m%d"))


############################################################
# 1. Alpha Class (Quarterly Rebalance Implementation)
############################################################
class Alpha(BaseAlpha):
    """Quarterly-rebalanced double-momentum / drawdown strategy on S&P 500 names."""

    def __init__(self, dd_quantile=2, rebalance_freq="Q",
                 liquidity_filter=False, low_vol_filter=False):
        """
        dd_quantile: Which drawdown quantile group (1–5) to buy
        rebalance_freq: 'Q' for quarterly rebalancing
        liquidity_filter: Whether to exclude low-volume stocks
        low_vol_filter: Placeholder for future use (low volatility filter)

        Loads the full-history price/volume/constituent data once, masked to
        S&P 500 membership. NOTE: the data window is hard-coded below and
        ignores the start/end later passed to get().
        """
        self.dd_quantile = dd_quantile
        self.rebalance_freq = rebalance_freq
        self.liquidity_filter = liquidity_filter
        self.low_vol_filter = low_vol_filter
        
        # Load data once for the entire period (hard-coded full history)
        overall_start, overall_end = 19980101, 20251231 
        pre_start = calculate_previous_start_date(overall_start, 365)

        print("Loading data once for all periods...")
        cf = ContentFactory("us_stock", pre_start, overall_end)

        # Adjusted prices and trading volume
        self.close = cf.get_df(
            "price_close",
            adj=True,
            adj_div=True,
            quantit_universe=False,
            fill_nan=False
        )
        self.volume = cf.get_df(
            "trading_volume",
            quantit_universe=False,
            fill_nan=False
        )

        # S&P500 universe (1 = member)
        spx_const = cf.get_df(
            "spx_constituent",
            quantit_universe=False,
            fill_nan=False
        )

        # Align indices and columns
        common_idx = self.close.index.intersection(spx_const.index)
        common_cols = self.close.columns.intersection(spx_const.columns)

        self.close = self.close.loc[common_idx, common_cols]
        self.volume = self.volume.loc[common_idx, common_cols]

        # Keep values only where stock is included in S&P500
        self.close = self.close.where(spx_const == 1)
        self.volume = self.volume.where(spx_const == 1)

        print("Data load complete.")

    def get(self, start, end, bm=None):
        """
        Generate position time-series for the backtest.
        Rebalances only at the beginning of each quarter.
        Maintains previous weights during the quarter (ffill).

        bm is accepted but unused. NOTE(review): the per-date Python loop
        below slices the full price history for every date × selected name,
        so this runs in minutes, not seconds, on long histories.
        """
        close = self.close
        volume = self.volume
        
        # -----------------------------
        # 1) Factor Computation
        # -----------------------------
        mom_12m = close.pct_change(252)  # 12-month momentum
        mom_3m  = close.pct_change(63)   # 3-month momentum
        mom_123 = mom_12m - mom_3m       # Adjusted momentum factor

        trading_value = volume * close   # Liquidity proxy

        # Initialize signal matrix (dates × tickers)
        df_signal = pd.DataFrame(0.0, index=close.index, columns=close.columns)

        # -----------------------------
        # 2) Daily Signal Generation
        # -----------------------------
        for dt in close.index:
            today_scores = mom_123.loc[dt]
            if pd.isna(today_scores).all():
                continue

            today_scores = today_scores.dropna()
            if len(today_scores) == 0:
                continue

            # (a) Top 20% momentum selection
            k = max(1, int(len(today_scores) * 0.2))
            top_bucket = today_scores.nlargest(k).index

            # (b) Drawdown calculation: distance from recent 12-month high
            dd_series = {}
            for col in top_bucket:
                past_252 = close.loc[:dt, col].iloc[-252:].dropna()
                if len(past_252) == 0:
                    continue
                high_12_3 = past_252.max()
                price_today = close.loc[dt, col]

                if (
                    pd.notna(high_12_3) and 
                    high_12_3 > 0 and 
                    pd.notna(price_today)
                ):
                    dd_series[col] = (price_today / high_12_3) - 1  # Negative = pullback

            selected = []
            if len(dd_series) > 0:
                dd_series = pd.Series(dd_series)
                dd_series = dd_series[dd_series < 0]  # Exclude those at new highs

                if len(dd_series) > 0:
                    ranks = dd_series.rank(pct=True)

                    # Bucket drawdowns into quintiles 1 (mildest) .. 5 (deepest).
                    # NOTE(review): constructing an int Series with no data may
                    # error on newer pandas; all entries are overwritten by the
                    # bucket assignments below — verify against the pandas in use.
                    labels = pd.Series(index=dd_series.index, dtype=int)
                    labels[ranks <= 0.2] = 1
                    labels[(ranks > 0.2) & (ranks <= 0.4)] = 2
                    labels[(ranks > 0.4) & (ranks <= 0.6)] = 3
                    labels[(ranks > 0.6) & (ranks <= 0.8)] = 4
                    labels[ranks > 0.8] = 5

                    selected = labels[labels == self.dd_quantile].index.tolist()

            # (c) Optional Liquidity Filter: keep names whose 63-day average
            #     trading value is at least the cross-sectional average
            if self.liquidity_filter and len(selected) > 0:
                mean_val = trading_value.loc[:dt].iloc[-63:].mean()
                spx_avg = mean_val.mean()
                selected = [
                    col for col in selected
                    if mean_val.get(col, 0) >= spx_avg
                ]

            # (d) Assign equal weight to selected stocks
            if len(selected) > 0:
                w = 1.0 / len(selected)
                df_signal.loc[dt, selected] = w

        # -----------------------------
        # 3) Quarterly Rebalancing Logic
        # -----------------------------
        # A rebalance day is any day whose quarter differs from the previous
        # day's quarter; the first row (NaN previous period) also qualifies.
        todays_period = df_signal.index.to_period(self.rebalance_freq)
        yesterdays_period = (
            pd.Series(df_signal.index, index=df_signal.index)
            .shift(1)
            .dt.to_period(self.rebalance_freq)
        )

        mask = todays_period != yesterdays_period
        df_signal_period = df_signal[mask]

        # Apply one-day delay (signals executed next day)
        # NOTE(review): this shift(1) runs on the rebalance-dates-only frame,
        # so each quarter's weights actually take effect at the NEXT quarter's
        # rebalance date (~3 months later), not the next trading day — confirm
        # whether that delay is intended.
        df_pos_shifted = df_signal_period.shift(1)

        # Carry positions forward until next rebalance
        df_pos_full = (
            df_pos_shifted
            .reindex(df_signal.index)
            .ffill()
            * 1e8  # Notional scaling
        ).fillna(0)

        # -----------------------------
        # 4) Return position within start–end range
        # -----------------------------
        return df_pos_full.loc[str(start):str(end)]


############################################################
# 2. Backtest Pipeline
############################################################

# Backtest period
backtest_start = 20150101
backtest_end   = 20241231

# Initialize Alpha (loads its own hard-coded full-history data window)
alpha = Alpha(
    dd_quantile=2,
    rebalance_freq="Q",
    liquidity_filter=False,
    low_vol_filter=False,
)

# Generate position DataFrame
position = alpha.get(backtest_start, backtest_end).fillna(0)

# Determine actual valid start date (first day with any nonzero weight)
first_valid_idx = position.replace(0, np.nan).dropna(how="all").index[0]
start_str = first_valid_idx.strftime("%Y%m%d")
end_str   = position.index[-1].strftime("%Y%m%d")

print(f"\nRunning backtest from {start_str} to {end_str} ...")

# Run simulation (fees/slippage presumably in basis points — confirm with docs)
sim = Simulator(market_type="us_stock")
bt = sim.run(
    position,
    start=start_str,
    end=end_str,
    buy_fee_tax=25,
    sell_fee_tax=25,
    slippage=10,
    initial_cash=1000,
    drip="reinvest",
    rebalancing_method="by_position",
    core_type="basic",
    debug=False,
)

print("Backtest completed.")


############################################################
# 3. Benchmark Comparison (e.g., NASDAQ)
############################################################
try:
    cf_bench = ContentFactory('raw', start_str, end_str)
    bench_df = cf_bench.get_df("content.factset.api.price_volume.world-index-price_close.1d")

    print("Benchmark columns:", bench_df.columns.tolist())
    # Take the first column whose name mentions NASDAQ.
    bench_col = [c for c in bench_df.columns if "NASDAQ" in c.upper()][0]

    # Align to the backtest start and rebase to the initial cash of 1000.
    bench_nav = bench_df[bench_col].ffill().loc[bt.summary.index[0]:]
    bench_nav = bench_nav / bench_nav.iloc[0] * 1000  # Normalize to 1000
except Exception as e:
    # Best-effort: the benchmark is optional, downstream code checks for None.
    print(" Benchmark load failed:", e)
    bench_nav = None


############################################################
# 4. NAV Visualization
############################################################
plt.figure(figsize=(14, 7))
plt.title("Quarterly Rebalance Momentum/DD Strategy vs Benchmark")

bt.summary.nav.plot(label="Strategy NAV", linewidth=2)

if bench_nav is not None:
    bench_nav.plot(label="Benchmark (e.g. NASDAQ)", linestyle="--", color="black")

plt.ylabel("Net Asset Value ($)")
plt.xlabel("Date")
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()


############################################################
# 5. Performance Metrics
############################################################
print("\n" + "="*60)
print("Performance Analysis: Strategy")
print("="*60)
stats = PortfolioAnalyzer.nav2stats(bt.summary["nav"], period=None)
print(stats)

if bench_nav is not None:
    print("\n" + "="*60)
    print("Performance Analysis: Benchmark")
    print("="*60)
    bench_stats = PortfolioAnalyzer.nav2stats(bench_nav, period=None)
    print(bench_stats)
```

***

### 14. FINTER Backtest 5 : Pairs Trading Strategy

```python
#################################################################
# Pair Trading Strategy (Statistical Arbitrage)
# - Uses cointegration between two stocks
# - Trades spread deviations based on hedge ratio
# - Executes mean-reversion trades with entry/exit thresholds
#################################################################

import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

from finter import BaseAlpha
from finter.data import ContentFactory
from finter.backtest.v0 import Simulator
from finter.performance import PortfolioAnalyzer


############################################################
# Helper Functions
############################################################
def calculate_previous_start_date(start_date, lookback_days):
    """Return `start_date` (YYYYMMDD int) moved `lookback_days` calendar days earlier."""
    anchor = datetime.strptime(str(start_date), "%Y%m%d")
    shifted = anchor - timedelta(days=lookback_days)
    return int(shifted.strftime("%Y%m%d"))


def calculate_hedge_ratio(close_prices, pair):
    """Estimate the hedge ratio of pair[0] regressed on pair[1] via OLS (no intercept)."""
    fitted = sm.OLS(close_prices[pair[0]], close_prices[pair[1]]).fit()
    hedge_ratio = fitted.params.values[0]
    print(fitted.summary())
    print(f"Hedge ratio estimated: {hedge_ratio:.4f}")
    return hedge_ratio


def calculate_spread(close_prices, hedge_ratio, pair):
    """Return the spread: first leg minus hedge_ratio times the second leg."""
    first_leg = close_prices[pair[0]]
    second_leg = close_prices[pair[1]]
    print(f"Hedge ratio = {hedge_ratio:.3f}")
    return first_leg - hedge_ratio * second_leg


def adf_test(series):
    """Run the Augmented Dickey-Fuller test (maxlag=1) and return its p-value."""
    p_value = sm.tsa.stattools.adfuller(series, maxlag=1)[1]
    return p_value


def define_entry_exit(spread):
    """Return (entry, exit) thresholds: mean + 1 std for entry, mean for exit."""
    mu = spread.mean()
    sigma = spread.std()
    return mu + 1 * sigma, mu


def execute_trades(spread, entry_threshold, exit_threshold):
    """
    Generate trading signals:
    - Short spread when above entry threshold
    - Exit (go long) when spread mean reverts
    """
    signal = pd.Series(index=spread.index, dtype=float)
    holding = False
    for ts in spread.index:
        value = spread[ts]
        if holding and value < exit_threshold:
            signal[ts] = 1  # Long spread (close the short)
            holding = False
        elif not holding and value > entry_threshold:
            signal[ts] = -1  # Short spread
            holding = True
        else:
            signal[ts] = 0  # Hold current state
    return signal


def signal_to_position(signal, pair, hedge_ratio):
    """Translate +/-1 spread signals into per-leg dollar positions."""
    held = signal.replace(0, np.nan).ffill()  # carry last active signal forward
    short_leg = held.rename(pair[0]) * -0.5e8                 # Short first asset
    long_leg = held.rename(pair[1]) * 0.5e8 * hedge_ratio     # Long second asset
    return pd.concat([short_leg, long_leg], axis=1)


############################################################
# Alpha Class Definition
############################################################
class Alpha(BaseAlpha):
    """Mean-reversion pairs alpha on a fixed, pre-screened cointegrated pair."""

    def get(self, start, end):
        LOOKBACK_DAYS = 365
        pre_start = calculate_previous_start_date(start, LOOKBACK_DAYS)

        # Load prices with a fixed history start so spread statistics are stable
        factory = ContentFactory(universe_name="us_stock", start=20120101, end=end)
        price = factory.get_df("price_close")

        # Hard-coded pair of gvkeyiid symbols
        pair = ['00118601', '00107501']

        # Work in cumulative log-return space
        log_price = np.log(1 + price.pct_change().fillna(0)).cumsum()
        close_prices = log_price[pair].dropna()

        # Fixed hedge ratio (estimation path kept for reference)
        # hedge_ratio = calculate_hedge_ratio(close_prices, pair)
        hedge_ratio = 1.65

        # Spread and cointegration diagnostic
        spread = calculate_spread(close_prices, hedge_ratio, pair)
        p_value = adf_test(spread)
        print(f"The pair {pair} cointegration p-value: {p_value:.4f}")

        # Fixed thresholds (in-sample estimates; dynamic path kept for reference)
        # entry_threshold, exit_threshold = define_entry_exit(spread)
        spread_mean, spread_std = 0.028, 0.094
        entry_threshold = spread_mean + spread_std
        exit_threshold = spread_mean

        # Signals -> per-leg dollar positions, executed next day
        signal = execute_trades(spread, entry_threshold, exit_threshold)
        position = signal_to_position(signal, pair, hedge_ratio)
        position = position.fillna(0).shift(1)  # Trade next day (lookahead safe)
        position = position / 2  # Adjust exposure scale

        return position.loc[str(start): str(end)]


############################################################
# Backtest Pipeline
############################################################
# ---- Backtest configuration ----
backtest_start = 20160101
backtest_end = 20241231

# Build the position matrix from the alpha
alpha = Alpha()
position = alpha.get(backtest_start, backtest_end)

# Trim the simulation window to the first date with any non-zero exposure
first_valid_idx = position.replace(0, np.nan).dropna(how="all").index[0]
start_str = first_valid_idx.strftime("%Y%m%d")
end_str = position.index[-1].strftime("%Y%m%d")

print(f"\nRunning backtest from {start_str} to {end_str} ...")

# Simulate with fees/slippage in basis points and dividend reinvestment
sim = Simulator(market_type="us_stock")
bt = sim.run(
    position,
    start=start_str,
    end=end_str,
    buy_fee_tax=25,
    sell_fee_tax=25,
    slippage=10,
    initial_cash=1000,
    drip="reinvest",
    rebalancing_method="by_position",
    core_type="basic",
    debug=False,
)

print("Backtest completed successfully.")


############################################################
# NAV Visualization
############################################################
# Plot the strategy NAV over the simulated period.
plt.figure(figsize=(12, 6))
plt.title("Pair Trading Strategy NAV")
bt.summary["nav"].plot(label="Strategy NAV", color="blue", linewidth=2)
plt.xlabel("Date")
plt.ylabel("Net Asset Value ($)")
plt.grid(True, linestyle="--", alpha=0.6)
plt.legend()
plt.tight_layout()
plt.show()


############################################################
# Performance Metrics
############################################################
# Summary statistics for the pair-trading NAV.
banner = "=" * 60
print("\n" + banner)
print("Performance Analysis: Pair Trading Strategy")
print(banner)
stats = PortfolioAnalyzer.nav2stats(bt.summary["nav"], period=None)
print(stats)
```

**Browse Full Code:**

{% file src="<https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-7bbd263ba4d9509916b7375d3f0dcfc9178819d7%2F%5BUS%20Stock%5D%20Pair%20Trading.ipynb?alt=media>" %}

***

### 15. FINTER Backtest 6 : Working Capital Strategy

```python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from finter import BaseAlpha
from finter.data import ContentFactory, FinHelper, IdTable
from finter.calendar import iter_trading_days
from finter.backtest.v0 import Simulator
from finter.performance import PortfolioAnalyzer
import matplotlib.pyplot as plt

############################################################
# Helper Functions
############################################################
def calculate_previous_start_date(start_date, lookback_days):
    """Shift a YYYYMMDD integer date backward by `lookback_days` calendar days."""
    shifted = datetime.strptime(str(start_date), "%Y%m%d") - timedelta(days=lookback_days)
    return int(shifted.strftime("%Y%m%d"))


def fill_within_and_prev_fiscal(df: pd.DataFrame) -> pd.DataFrame:
    """Forward-fill PIT data first within each (id, fiscal) period, then across periods per id."""
    # Pass 1: fill gaps inside a single fiscal period, ordered by PIT date.
    within = (
        df.sort_values(["id", "fiscal", "pit"])
          .groupby(["id", "fiscal"], group_keys=False)
          .apply(lambda g: g.ffill())
    )
    # Pass 2: carry the latest observation forward across fiscal periods.
    return (
        within.sort_values(["id", "pit", "fiscal"])
              .groupby(["id"], group_keys=False)
              .apply(lambda g: g.ffill())
    )


def latest_as_of_date(df: pd.DataFrame, asof_date: pd.Timestamp) -> pd.DataFrame:
    """Return the latest observable record per id, using only rows with pit <= asof_date."""
    observable = df[df["pit"] <= asof_date]
    if observable.empty:
        return observable

    # Newest restatement for each (id, fiscal) pair...
    per_fiscal = (
        observable.sort_values(["id", "fiscal", "pit"])
                  .groupby(["id", "fiscal"], as_index=False)
                  .last()
    )
    # ...then the most recently published fiscal period per id.
    return (
        per_fiscal.sort_values(["id", "pit", "fiscal"])
                  .groupby("id", as_index=False)
                  .last()
    )


def build_primary_id_maps() -> tuple[dict, dict]:
    """Map each gvkey to its primary listing (non-OTC preferred) and that listing to its ticker."""
    universe = IdTable("spglobal-usa").get_stock()
    universe = universe.query("tpci in ['0']").copy()

    # Prefer exchange-listed share classes over OTC when deduplicating per gvkey.
    universe["is_otc"] = universe["miccd"].isin(['XOTC', 'OOTC']).astype(int)
    primary = (
        universe.sort_values(["gvkey", "is_otc"], ascending=[True, True])
                .drop_duplicates(subset=["gvkey"], keep="first")
                .drop(columns="is_otc")
                .reset_index(drop=True)
    )
    primary["gvkeyiid"] = primary["gvkey"] + primary["iid"]

    gvkey_to_gvkeyiid = primary.set_index("gvkey")["gvkeyiid"].to_dict()
    gvkeyiid_to_ticker = primary.set_index("gvkeyiid")["tic"].to_dict()
    return gvkey_to_gvkeyiid, gvkeyiid_to_ticker

class Alpha(BaseAlpha):
    """
    Working Capital Strategy

    Best of both worlds:
    - Lazy loading for speed
    - Fixed start for consistency

    Daily flow: rank S&P 500 constituents by the 4-quarter change in working
    capital (scaled by current assets), exclude the top-quintile inventory
    builders, and equal-weight the top decile of the remainder.
    """

    # Universe identifier consumed by the finter framework.
    universe = "us_stock"

    # ✅ Fixed start date: get() always rebuilds history from here so the
    # output is independent of the `start` argument it receives.
    FIXED_START = 19980101

    # Lazy loading flag.
    # NOTE(review): class-level default; _ensure_data_loaded assigns
    # self._data_loaded on the instance, shadowing this attribute.
    _data_loaded = False

    def _ensure_data_loaded(self, end):
        """Load data once, cache for reuse"""
        if self._data_loaded:
            return

        # 730 calendar days of extra history so 4-quarter shifts are
        # already populated at FIXED_START.
        pre_start = calculate_previous_start_date(self.FIXED_START, 730)

        print(f"[Alpha] Loading data from {pre_start} to {end}...")
        self.cf = ContentFactory("us_stock", start=pre_start, end=end)

        self._load_data()

        self._data_loaded = True
        print(f"[Alpha] Data loaded successfully")

    def _load_data(self):
        """Load and preprocess all data"""
        self.map_gvkey_to_gvkeyiid, self.map_gvkeyiid_to_ticker = build_primary_id_maps()

        # Point-in-time quarterly fundamentals in long (unpivot) format.
        # NOTE(review): mnemonics presumed to be Compustat quarterly items
        # (atq=total assets, actq=current assets, lctq=current liabilities,
        # invtq=inventory) — confirm against the content catalog.
        atq  = self.cf.get_df("pit-atq",  mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int})
        actq = self.cf.get_df("pit-actq", mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int})
        lctq = self.cf.get_df("pit-lctq", mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int})
        invtq = self.cf.get_df("pit-invtq", mode="unpivot").dropna(how="all", axis=1).astype({"fiscal": int})

        # Fill gaps within each fiscal period first, then across periods.
        atq  = fill_within_and_prev_fiscal(atq)
        actq = fill_within_and_prev_fiscal(actq)
        lctq = fill_within_and_prev_fiscal(lctq)
        invtq = fill_within_and_prev_fiscal(invtq)

        # Align all four series on a common (id, pit, fiscal) index.
        u_atq, u_actq, u_lctq, u_invtq = FinHelper.unify_idx(atq, actq, lctq, invtq)

        df_assets = pd.concat([
            u_atq.rename(columns={"value": "TotalAssets"}),
            u_actq.rename(columns={"value": "CurrentAssets"}),
            u_lctq.rename(columns={"value": "CurrentLiabilities"}),
            u_invtq.rename(columns={"value": "Inventory"}),
        ], axis=1).reset_index()

        df_assets = df_assets.sort_values(["id", "pit", "fiscal"])

        # 4-quarter (year-over-year) change in working capital, scaled by
        # current assets to make it comparable across companies.
        df_assets["WorkingCapital"] = df_assets["CurrentAssets"] - df_assets["CurrentLiabilities"]
        df_assets["WC_t_minus4"] = df_assets.groupby("id")["WorkingCapital"].shift(4)
        df_assets["Delta_WC_Pct"] = (
            (df_assets["WorkingCapital"] - df_assets["WC_t_minus4"]) / 
            df_assets["CurrentAssets"]
        )

        # Same 4-quarter change for inventory (used as an exclusion filter).
        df_assets["Inv_t_minus4"] = df_assets.groupby("id")["Inventory"].shift(4)
        df_assets["Delta_Inv_Pct"] = (
            (df_assets["Inventory"] - df_assets["Inv_t_minus4"]) / 
            df_assets["CurrentAssets"]
        )

        # Map entity ids to their primary listing and ticker; rows without a
        # usable signal or mapping are dropped.
        df_assets["gvkeyiid"] = df_assets["id"].map(self.map_gvkey_to_gvkeyiid)
        df_assets["ticker"]   = df_assets["gvkeyiid"].map(self.map_gvkeyiid_to_ticker)
        df_assets = df_assets.dropna(subset=["Delta_WC_Pct", "gvkeyiid", "ticker"])

        self.df_assets = df_assets

        # S&P 500 constituent flags (date x gvkeyiid), used as the universe.
        spx_const = self.cf.get_df("spx_constituent", quantit_universe=False, fill_nan=False)
        spx_const.index = pd.to_datetime(spx_const.index)
        self.spx_const = spx_const

    def get(self, start, end):
        """
        Generate position matrix

        CRITICAL: Ignores 'start' parameter!
        Always uses FIXED_START for consistency.
        """
        from datetime import datetime

        # ✅ Lazy load (once)
        self._ensure_data_loaded(end)

        # ✅ Use pre-loaded data
        df_assets = self.df_assets
        spx_const = self.spx_const

        # ✅ CRITICAL: Use FIXED_START
        trading_days = list(iter_trading_days(self.FIXED_START, end, exchange="us"))
        rows = []

        for asof_date in trading_days:
            # Latest observable fundamentals per id as of this trading day.
            snap = latest_as_of_date(
                df_assets[["id", "fiscal", "pit", "Delta_WC_Pct", "Delta_Inv_Pct", "gvkeyiid"]],
                pd.Timestamp(asof_date)
            )

            if snap.empty:
                continue

            # Restrict to S&P 500 members; when the exact date is missing
            # from the constituent table, fall back to the most recent
            # earlier snapshot, and skip the day if none exists.
            if asof_date in spx_const.index:
                spx_on_date = spx_const.loc[asof_date]
                spx_tickers = spx_on_date[spx_on_date == 1].index.tolist()
                snap = snap[snap['gvkeyiid'].isin(spx_tickers)]
            else:
                valid_dates = spx_const.index[spx_const.index <= asof_date]
                if len(valid_dates) > 0:
                    nearest_date = valid_dates[-1]
                    spx_on_date = spx_const.loc[nearest_date]
                    spx_tickers = spx_on_date[spx_on_date == 1].index.tolist()
                    snap = snap[snap['gvkeyiid'].isin(spx_tickers)]
                else:
                    continue

            # Require a minimum cross-section before ranking.
            snap = snap.dropna(subset=["Delta_WC_Pct", "gvkeyiid"]).copy()
            if len(snap) < 5:
                continue

            # Exclude the top 20% inventory builders when enough names have
            # inventory data; otherwise keep the unfiltered snapshot.
            snap_with_inv = snap.dropna(subset=["Delta_Inv_Pct"])
            if len(snap_with_inv) >= 5:
                inv_threshold = snap_with_inv["Delta_Inv_Pct"].quantile(0.8)
                snap = snap_with_inv[snap_with_inv["Delta_Inv_Pct"] < inv_threshold]

            if len(snap) < 5:
                continue

            # Top decile by working-capital growth, equal-weighted.
            snap = snap.sort_values("Delta_WC_Pct", ascending=False)
            k = max(1, int(len(snap) * 0.1))
            pick = snap.head(k).copy()

            pick["weight"] = 1.0 / len(pick)
            pick["rebalance_date"] = pd.Timestamp(asof_date)

            rows.append(pick[["rebalance_date", "gvkeyiid", "weight"]])

        if not rows:
            # No selectable names on any day: return an empty all-zero frame.
            trading_dates_dt = [datetime(d.year, d.month, d.day) for d in trading_days]
            return pd.DataFrame(0, index=trading_dates_dt, columns=[])

        df_port = pd.concat(rows, ignore_index=True)
        df_port = df_port.groupby(["rebalance_date", "gvkeyiid"], as_index=False)["weight"].mean()
        # Re-normalize so each day's weights sum to 1.
        df_port["weight"] = df_port.groupby("rebalance_date")["weight"].transform(lambda x: x / x.sum())

        position = df_port.pivot(index="rebalance_date", columns="gvkeyiid", values="weight").sort_index()

        trading_dates_dt = [datetime(d.year, d.month, d.day) for d in trading_days]
        position = position.reindex(trading_dates_dt, fill_value=0)

        # Scale weights to notional dollars, shift one day for T+1 execution.
        position = position * 1e8
        position = position.shift(1).fillna(0)

        # Slice to requested period
        position = position.loc[str(start):str(end)]

        return position

```

***

### 16. FINTER Backtest 7 : Sharpe-weighted Portfolio Model

```python
import pandas as pd
import numpy as np
from datetime import datetime
from finter import BasePortfolio
from finter.data import ContentFactory, IdTable
from finter.modeling import DateConverter
from finter.modeling.utils import daily2period
from finter.backtest import Simulator
from finter.performance import PortfolioAnalyzer


############################################################
# 0. Helper: Rolling Sharpe Calculation
############################################################
def rolling_sharpe(returns: pd.Series, window: int = 60) -> pd.Series:
    """
    Rolling Sharpe-like score: rolling mean divided by rolling std over
    `window` trading days (default 60).

    Args:
        returns (pd.Series): Daily return series of an alpha strategy.
        window (int): Rolling window length in trading days.

    Returns:
        pd.Series: Time series of rolling Sharpe-like ratios (NaN until the
        window is full).
    """
    roll = returns.rolling(window)
    return roll.mean() / roll.std()


############################################################
# 1. Portfolio Class
############################################################
class Portfolio(BasePortfolio):
    """
    Sharpe-weighted multi-alpha stock portfolio (monthly rebalance, next-day execution).

    Universe: us_stock

    Alpha models:
        1) us.compustat.stock.ywcho.alphathon2_yw_di
        2) us.compustat.stock.jyjung.insur_spxndx_roe
        3) us.compustat.stock.sypark.US_BDC_v4

    Core logic:
        1. Load each alpha’s per-stock position (weights/exposures).
        2. Compute 60-day rolling Sharpe-like scores for each alpha based on daily returns.
        3. Normalize the Sharpe scores into alpha-level weights per day.
        4. Blend alpha positions using those dynamic weights.
        5. Apply monthly rebalancing via month-end snapshot and shift(1) to simulate T+1 execution.
        6. Drop NaN-only and zero-only columns, validate index alignment, and return clean portfolio.
    """

    alpha_list = [
        "us.compustat.stock.ywcho.alphathon2_yw_di",
        "us.compustat.stock.jyjung.insur_spxndx_roe",
        "us.compustat.stock.sypark.US_BDC_v4",
    ]

    def __init__(self) -> None:
        """Load security ID table (gvkey, iid) for reference and debugging."""
        self.df_info = IdTable("spglobal-usa").get_stock()
        self.df_info["gvkeyiid"] = self.df_info["gvkey"] + self.df_info["iid"]

    def _is_date_like(self, x) -> bool:
        """
        Heuristic check: does `x` look like a date?
        Used to guess whether DataFrame columns represent dates.

        NOTE(review): any string starting with "19" or "20" passes, so a
        non-date identifier with that prefix would be misclassified; the
        heuristic only inspects the first column of a frame.
        """
        if isinstance(x, (pd.Timestamp, np.datetime64)):
            return True
        x_str = str(x)
        if x_str.startswith("19") or x_str.startswith("20"):
            return True
        return False

    def _normalize_alpha_position_shape(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize the alpha position DataFrame shape to (date_index x gvkeyiid_columns).

        Some alphas return (asset x date) tables, others (date x asset).
        If columns look like dates, we assume (asset x date) and transpose.

        Args:
            df (pd.DataFrame): Raw alpha position DataFrame.

        Returns:
            pd.DataFrame: Normalized DataFrame with date index and gvkeyiid columns.
        """
        if len(df.columns) == 0:
            return df

        first_col = df.columns[0]
        cols_are_dates = self._is_date_like(first_col)

        # If columns look like dates, assume shape is (asset x date) and transpose
        if cols_are_dates:
            df = df.T

        # Coerce index to datetime if possible; leave the index untouched
        # when conversion fails (e.g. non-date labels).
        try:
            df.index = pd.to_datetime(df.index)
        except Exception:
            pass

        # Sort index and drop duplicate timestamps
        df = df.sort_index()
        df = df.loc[~df.index.duplicated(keep="first")]

        return df

    def get(self, start: int, end: int) -> pd.DataFrame:
        """
        Required method by BasePortfolio: generate the target position (time x asset)
        that will be consumed by the Simulator.

        Pipeline:
            1. Prepare lookback window and trading calendar.
            2. Load per-alpha daily returns and compute rolling Sharpe scores.
            3. Normalize Sharpe scores into daily alpha weights.
            4. Load and align alpha per-stock positions.
            5. Blend alphas using time-varying Sharpe weights.
            6. Slice to the requested date range.
            7. Convert to monthly rebalance via month-end snapshots,
               then shift(1) to simulate next-day execution, and forward-fill.
            8. Drop all-NaN or all-zero columns and fill remaining NaNs with 0.
            9. Validate dates and trading calendar consistency.
        """

        # 1) Prepare date window and calendar
        lookback_days = 365
        data_start = DateConverter.get_pre_start(start, lookback_days)

        cf = ContentFactory("us_stock", data_start, end)
        tr_days = cf.trading_days

        start_dt = datetime.strptime(str(start), "%Y%m%d")
        end_dt = datetime.strptime(str(end), "%Y%m%d")

        # 2) Load daily returns for each alpha
        alpha_ret_dict_raw = self.alpha_return_dict("us_stock", data_start, end)
        alpha_ret_dict = {}
        for name in self.alpha_list:
            # Missing days count as 0 return so the rolling window stays full.
            r = alpha_ret_dict_raw[name].reindex(tr_days).fillna(0)
            alpha_ret_dict[name] = r

        # 3) Compute rolling Sharpe-like scores and normalize to weights
        sharpe_df = pd.DataFrame({
            name: rolling_sharpe(alpha_ret_dict[name], window=60)
            for name in self.alpha_list
        })

        weight_df = pd.DataFrame(index=sharpe_df.index, columns=self.alpha_list, dtype=float)
        sharpe_sum = sharpe_df.sum(axis=1)

        # NOTE(review): Sharpe scores can be negative, so individual weights
        # may be negative or exceed 1 when the sum is small but finite; only
        # a non-finite or ~0 sum triggers the equal-weight fallback below.
        for name in self.alpha_list:
            weight_df[name] = sharpe_df[name] / sharpe_sum

        # Fallback for cases where Sharpe totals are undefined or ~0
        equal_w = 1.0 / len(self.alpha_list)
        for t in weight_df.index:
            if (not np.isfinite(sharpe_sum.loc[t])) or (abs(sharpe_sum.loc[t]) < 1e-12):
                weight_df.loc[t, :] = equal_w

        # Stabilize early periods (ffill) and ensure no NaNs remain
        weight_df = weight_df.ffill().fillna(equal_w)

        # 4) Load and normalize per-alpha positions
        alpha_loader = self.alpha_loader_v2(data_start, end)
        position_dict = {}
        for name in self.alpha_list:
            raw_pos = alpha_loader.get_alpha(name)
            norm_pos = self._normalize_alpha_position_shape(raw_pos)

            # Align to trading days and fill missing exposures with 0
            norm_pos = norm_pos.reindex(tr_days).fillna(0)
            position_dict[name] = norm_pos

        # Sanity check: all alphas must share the same time index
        base_index = position_dict[self.alpha_list[0]].index
        assert all(position_dict[n].index.equals(base_index) for n in self.alpha_list[1:]), \
            "Alpha position indices are not aligned."

        # 5) Blend alphas using Sharpe-based weights
        blended_position = None
        for name in self.alpha_list:
            w_series = weight_df[name]  # daily alpha weight series
            weighted_pos = position_dict[name].multiply(w_series, axis=0)
            if blended_position is None:
                blended_position = weighted_pos
            else:
                blended_position = blended_position.add(weighted_pos, fill_value=0)

        # 6) Slice to requested range
        blended_position = blended_position.loc[str(start):str(end)]

        # 7) Monthly rebalance with T+1 execution:
        #    - Take month-end snapshots (monthly rebalance schedule).
        #    - Shift by 1 day so the rebalance decision applies next trading day.
        #    - Forward-fill to hold positions until the next rebalance.
        monthly_snapshot = daily2period(blended_position, "M")
        monthly_snapshot = monthly_snapshot.shift(1)  # T+1 execution
        monthly_position = monthly_snapshot.reindex(blended_position.index).ffill()

        # 8) Clean columns:
        #    - Drop columns that are entirely NaN (no exposure ever).
        #    - Drop columns that are always 0 (never held).
        #    - For any remaining partial NaNs (e.g. first rows before ffill),
        #      fill with 0 to indicate "no position."
        monthly_position = monthly_position.dropna(axis=1, how="all")

        non_zero_cols = (monthly_position.abs().sum(axis=0) > 0)
        monthly_position = monthly_position.loc[:, non_zero_cols]

        monthly_position = monthly_position.fillna(0)

        # 9) Final validation and integrity checks
        # NOTE(review): these asserts are stripped under `python -O`; raise
        # exceptions instead if the checks must survive optimized runs.
        assert monthly_position.index[0] >= pd.Timestamp(start_dt), \
            "Position starts before requested start date."
        assert monthly_position.index[-1] <= pd.Timestamp(end_dt), \
            "Position ends after requested end date."
        assert monthly_position.notna().all().all(), \
            "NaN detected in final position."
        assert len(monthly_position.columns) > 0, \
            "All columns dropped (no active exposures)."

        cf_main = ContentFactory("us_stock", start, end)
        valid_trading_days = cf_main.trading_days
        assert set(monthly_position.index).issubset(set(valid_trading_days)), \
            "Position index must match valid trading days."

        self.position = monthly_position
        return monthly_position


############################################################
# 2. Execution / Backtest / Analysis
############################################################
if __name__ == "__main__":
    # Backtest window
    start, end = 20150101, 20251014

    # Build the Sharpe-weighted position matrix
    pf = Portfolio()
    position = pf.get(start, end)

    # Simulator for the us_stock universe
    sim = Simulator(market_type="us_stock")

    # Run the simulation with realistic cost assumptions
    simres = sim.run(
        position.clip(upper=1e8),   # Clip excessive exposures for stability
        start=start,
        end=end,
        buy_fee_tax=20,             # 0.2% buy fee
        sell_fee_tax=20,            # 0.2% sell fee
        slippage=10,                # 0.1% slippage
        initial_cash=100_000_000,   # Initial capital
        drip="reinvest",            # Reinvest cash distributions/dividends
        rebalancing_method="by_position",
        core_type="basic",
        debug=True
    )

    # Cumulative-return (%) chart of the NAV curve
    nav = simres.summary["nav"]
    cumret_pct = (nav / nav.iloc[0] - 1.0) * 100.0
    cumret_pct.plot(
        grid=True,
        title="Sharpe-Weighted Multi-Alpha Stock Portfolio\n(Monthly Rebalance, T+1 Execution)",
        ylabel="Return (%)",
        xlabel="Date"
    )

    # Summary performance statistics
    stats = PortfolioAnalyzer.nav2stats(nav, period=None)
    print("===== Performance Statistics =====")
    print(stats)

    # Optional: example for model submission
    # from finter import JupyterLabSubmission
    # jls = JupyterLabSubmission(
    #     model_name="tutorial_us_stock_multi_alpha",
    #     model_universe="us_stock"
    # )

<figure><img src="https://2201909190-files.gitbook.io/~/files/v0/b/gitbook-x-prod.appspot.com/o/spaces%2FPpEHhcMaleDgz0Cju56y%2Fuploads%2Fgit-blob-22fd029d60134f48f43a52ff1649f0c75c79babc%2Fimage%20(35).png?alt=media" alt=""><figcaption></figcaption></figure>
