#!/usr/bin/env python # coding: utf-8 # In[7]: import pandas as pd query = """ select final.CUSTOMER_NUMBER_main as Focal_id, CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total, final.Cash_deposit_count, final.SEGMENT, final.RISK, final.SAR_FLAG from ( ( select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main, subquery.Cash_deposit_total, subquery.Cash_deposit_count from ( select customer_number as CUSTOMER_NUMBER_1, sum(transaction_amount) as Cash_deposit_total, count(*) as Cash_deposit_count from ( select * from {trans_data} trans_table left join {acc_data} acc_table on trans_table.benef_account_number = acc_table.account_number ) trans where account_number not in ('None') and transaction_desc = 'CASH RELATED TRANSACTION' group by customer_number ) subquery ) main left join ( select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust, cd.SEGMENT, cd.RISK, case when ad.SAR_FLAG is NULL then 'N' else ad.SAR_FLAG end as SAR_FLAG from ( select customer_number as CUSTOMER_NUMBER_3, business_segment as SEGMENT, case when RISK_CLASSIFICATION = 1 then 'Low Risk' when RISK_CLASSIFICATION = 2 then 'Medium Risk' when RISK_CLASSIFICATION = 3 then 'High Risk' else 'Unknown Risk' end AS RISK from {cust_data} ) cd left join ( select customer_number as CUSTOMER_NUMBER_4, sar_flag as SAR_FLAG from {alert_data} ) ad on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4 ) as cust_alert on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main ) as final """ from tms_data_interface import SQLQueryInterface class Scenario: seq = SQLQueryInterface(schema="transactionschema") def logic(self, **kwargs): row_list = self.seq.execute_raw(query.format(trans_data="transaction10m", cust_data="customer_data_v1", acc_data="account_data_v1", alert_data="alert_data_v1") ) cols = ["Focal_id", "Cash_deposit_total", "Cash_deposit_count", "Segment", "Risk", "SAR_FLAG"] df = pd.DataFrame(row_list, columns = cols) df["Cash_deposit_total"] = df["Cash_deposit_total"].astype(float) # Step 1: Compute 90th percentiles per Segment for all 3 fields percentiles = ( df.groupby("Segment")[["Cash_deposit_total", "Cash_deposit_count"]] .quantile(0.98) .reset_index() ) # Rename columns for clarity percentiles = percentiles.rename(columns={ "Cash_deposit_total": "P90_Credit", "Cash_deposit_count": "P90_Credit_count" }) # Step 2: Merge back to main df df = df.merge(percentiles, on="Segment", how="left") # Step 3: Identify customers above 90th percentile in ANY of the 3 metrics high_pop = ( (df["Cash_deposit_total"] > df["P90_Credit"]) & (df["Cash_deposit_count"] > df["P90_Credit_count"]) ) # Step 4: Randomly select 0.1% sample from high-risk population sample_fraction = 0.1 # 0.1% high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index # Step 5: Set SAR_FLAG values df["SAR_FLAG"] = "N" # default for all df.loc[high_pop_indices, "SAR_FLAG"] = "Y" # assign Y to 0.1% random high-risk population return df # In[8]: # sen = Scenario() # a = sen.logic() # In[10]: # a[a["SAR_FLAG"] == "Y"]