203/main.py
2025-11-28 07:37:53 +00:00

136 lines
4.2 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# In[7]:
import pandas as pd
query = """
select final.CUSTOMER_NUMBER_main as Focal_id,
CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,
final.Cash_deposit_count,
final.SEGMENT,
final.RISK,
final.SAR_FLAG
from
(
(
select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,
subquery.Cash_deposit_total,
subquery.Cash_deposit_count
from
(
select customer_number as CUSTOMER_NUMBER_1,
sum(transaction_amount) as Cash_deposit_total,
count(*) as Cash_deposit_count
from
(
select *
from {trans_data} trans_table
left join {acc_data} acc_table
on trans_table.benef_account_number = acc_table.account_number
) trans
where account_number not in ('None')
and transaction_desc = 'CASH RELATED TRANSACTION'
group by customer_number
) subquery
) main
left join
(
select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,
cd.SEGMENT,
cd.RISK,
case
when ad.SAR_FLAG is NULL then 'N'
else ad.SAR_FLAG
end as SAR_FLAG
from
(
select customer_number as CUSTOMER_NUMBER_3,
business_segment as SEGMENT,
case
when RISK_CLASSIFICATION = 1 then 'Low Risk'
when RISK_CLASSIFICATION = 2 then 'Medium Risk'
when RISK_CLASSIFICATION = 3 then 'High Risk'
else 'Unknown Risk'
end AS RISK
from {cust_data}
) cd
left join
(
select customer_number as CUSTOMER_NUMBER_4,
sar_flag as SAR_FLAG
from {alert_data}
) ad
on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4
) as cust_alert
on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main
) as final
"""
from tms_data_interface import SQLQueryInterface
class Scenario:
seq = SQLQueryInterface(schema="transactionschema")
def logic(self, **kwargs):
row_list = self.seq.execute_raw(query.format(trans_data="transaction10m",
cust_data="customer_data_v1",
acc_data="account_data_v1",
alert_data="alert_data_v1")
)
cols = ["Focal_id", "Cash_deposit_total", "Cash_deposit_count",
"Segment", "Risk", "SAR_FLAG"]
df = pd.DataFrame(row_list, columns = cols)
df["Cash_deposit_total"] = df["Cash_deposit_total"].astype(float)
# Step 1: Compute 90th percentiles per Segment for all 3 fields
percentiles = (
df.groupby("Segment")[["Cash_deposit_total",
"Cash_deposit_count"]]
.quantile(0.98)
.reset_index()
)
# Rename columns for clarity
percentiles = percentiles.rename(columns={
"Cash_deposit_total": "P90_Credit",
"Cash_deposit_count": "P90_Credit_count"
})
# Step 2: Merge back to main df
df = df.merge(percentiles, on="Segment", how="left")
# Step 3: Identify customers above 90th percentile in ANY of the 3 metrics
high_pop = (
(df["Cash_deposit_total"] > df["P90_Credit"]) &
(df["Cash_deposit_count"] > df["P90_Credit_count"])
)
# Step 4: Randomly select 0.1% sample from high-risk population
sample_fraction = 0.1 # 0.1%
high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index
# Step 5: Set SAR_FLAG values
df["SAR_FLAG"] = "N" # default for all
df.loc[high_pop_indices, "SAR_FLAG"] = "Y" # assign Y to 0.1% random high-risk population
return df
# In[8]:
# sen = Scenario()
# a = sen.logic()
# In[10]:
# a[a["SAR_FLAG"] == "Y"]