#!/usr/bin/env python
# coding: utf-8
"""Wire-deposit scenario: flag a sample of high-value wire recipients.

Aggregates credited 'WIRE RELATED TRANSACTION' amounts per customer, enriches
them with segment and risk, and then assigns a synthetic ``SAR_FLAG`` to a
random sample of customers above their segment's 98th percentile.
"""

# In[3]:

import logging

import pandas as pd

# In[4]:

from tms_data_interface import SQLQueryInterface

logger = logging.getLogger(__name__)

seq = SQLQueryInterface(schema="transactionschema")

# In[5]:

# Notebook exploration step, kept so importing the module behaves as before.
seq.execute_raw("show tables")

# In[6]:

# SQL template. Placeholders: {trans_data}, {acc_data}, {cust_data},
# {alert_data}. It returns one row per (customer, alert match) with the columns
# Focal_id, Credit_transaction_amount, SEGMENT, RISK, SAR_FLAG.
query = """
select
    final.CUSTOMER_NUMBER_main as Focal_id,
    final.Credit_transaction_amount,
    final.SEGMENT,
    final.RISK,
    final.SAR_FLAG
from
(
    (
        select
            subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,
            subquery.Credit_transaction_amount
        from
        (
            (
                select
                    customer_number as CUSTOMER_NUMBER_1,
                    sum(transaction_amount) as Credit_transaction_amount
                from
                (
                    select *
                    from {trans_data} as trans_table
                    left join {acc_data} as acc_table
                    on trans_table.benef_account_number = acc_table.account_number
                    where trans_table.transaction_desc = 'WIRE RELATED TRANSACTION'
                )
                where account_number not in ('None')
                group by 1
            ) credit
        ) subquery
    ) main
    left join
    (
        select
            subquery.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,
            subquery.SEGMENT,
            subquery.RISK,
            case
                when subquery.SAR_FLAG is NULL then 'N'
                else subquery.SAR_FLAG
            end as SAR_FLAG
        from
        (
            (
                select
                    customer_number as CUSTOMER_NUMBER_3,
                    business_segment as SEGMENT,
                    case
                        when RISK_CLASSIFICATION = 1 then 'Low Risk'
                        when RISK_CLASSIFICATION = 2 then 'Medium Risk'
                        when RISK_CLASSIFICATION = 3 then 'High Risk'
                        else 'Unknown Risk'
                    end AS RISK
                from {cust_data}
            ) cd
            left join
            (
                select
                    customer_number as CUSTOMER_NUMBER_4,
                    sar_flag as SAR_FLAG
                from {alert_data}
            ) ad
            on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4
        ) subquery
    ) cust_alert
    on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main
) final
"""

# Table names substituted into the SQL template.
TABLE_NAMES = {
    "trans_data": "transaction10m",
    "cust_data": "customer_data_v1",
    "acc_data": "account_data_v1",
    "alert_data": "alert_data_v1",
}

# Column names applied, in order, to the rows returned by the query.
RESULT_COLUMNS = ["Focal_id", "Total_Wire_Deposit_Amt", "Segment", "Risk", "SAR_FLAG"]

# Per-segment quantile above which a customer is considered high-value.
HIGH_VALUE_QUANTILE = 0.98

# Fraction of the high-value population that is flagged.
# NOTE(review): 0.1 means 10%. Earlier comments described this as "0.1%",
# which would be 0.001. The runtime value is kept unchanged; confirm which
# one the business rule intends.
SAMPLE_FRACTION = 0.1

# Fixed seed so the flagged sample is reproducible between runs.
SAMPLE_RANDOM_STATE = 42

# In[25]:


class Scenario:
    """Scenario that labels a random subset of top wire-deposit customers."""

    # Shared query interface; created once when the class body is executed.
    seq = SQLQueryInterface(schema="transactionschema")

    def logic(self, **kwargs):
        """Run the scenario query and assign ``SAR_FLAG`` values.

        Args:
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            pandas.DataFrame with the columns ``Focal_id``,
            ``Total_Wire_Deposit_Amt``, ``Segment``, ``Risk``, ``SAR_FLAG``
            and ``P98_Value`` (the 98th-percentile amount of the row's
            segment). ``SAR_FLAG`` is ``"Y"`` for the sampled high-value rows
            and ``"N"`` for every other row; the value returned by the SQL
            query is overwritten.
        """
        row_list = self.seq.execute_raw(query.format(**TABLE_NAMES))
        df = pd.DataFrame(row_list, columns=RESULT_COLUMNS)

        # Explicit 64-bit width: the bare 'int' alias is 32-bit on some
        # platforms (Windows with NumPy < 2) and summed amounts can exceed it.
        df["Total_Wire_Deposit_Amt"] = df["Total_Wire_Deposit_Amt"].astype("int64")

        # Step 1: 98th-percentile deposit amount for each segment.
        p98 = (
            df.groupby("Segment")["Total_Wire_Deposit_Amt"]
            .quantile(HIGH_VALUE_QUANTILE)
            .reset_index()
            .rename(columns={"Total_Wire_Deposit_Amt": "P98_Value"})
        )
        logger.debug("Per-segment 98th percentile:\n%s", p98)

        # Attach each row's segment threshold.
        df = df.merge(p98, on="Segment", how="left")

        # Step 2: rows strictly above their segment threshold. Rows whose
        # segment has no threshold (NaN) compare as False and are excluded.
        high_pop = df["Total_Wire_Deposit_Amt"] > df["P98_Value"]
        logger.debug(
            "%d of %d rows exceed their segment threshold",
            int(high_pop.sum()),
            len(df),
        )

        # Step 3: reproducible random sample of the high-value population.
        high_pop_indices = (
            df[high_pop]
            .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_RANDOM_STATE)
            .index
        )

        # Step 4: default everyone to "N", then mark the sampled rows "Y".
        df["SAR_FLAG"] = "N"
        df.loc[high_pop_indices, "SAR_FLAG"] = "Y"

        return df


# In[28]:

# sen = Scenario()
# a = sen.logic()

# In[29]:

# a[a['SAR_FLAG'] == "Y"]

# In[ ]:

#tst cmt