201/main.ipynb
2025-11-27 06:33:27 +00:00

550 lines
21 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 53,
"id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "2f9a4ca7-c066-4d93-9957-0d9145f9265d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from tms_data_interface import SQLQueryInterface\n",
"seq = SQLQueryInterface(schema=\"transactionschema\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fc212ace-ca7a-45f2-8137-f436c6123652",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"[['account_data_v1'],\n",
" ['account_data_v2'],\n",
" ['alert_data_v1'],\n",
" ['alert_data_v2'],\n",
" ['customer_data_v1'],\n",
" ['customer_data_v2'],\n",
" ['transaction10m'],\n",
" ['transaction60m']]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seq.execute_raw(\"show tables\")"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "134d0b3d-5481-4975-af07-c80ab09d6dd2",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query = \"\"\"\n",
" select final.CUSTOMER_NUMBER_main as Focal_id,\n",
" final.Credit_transaction_amount,\n",
" final.Total_no_of_credit_transactions,\n",
" final.Debit_transaction_amount,\n",
" final.Total_no_of_debit_transactions,\n",
" final.Wash_Ratio,\n",
" final.SEGMENT,\n",
" final.RISK,\n",
" final.SAR_FLAG\n",
" from \n",
" (\n",
" (\n",
" select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,\n",
" subquery.Credit_transaction_amount,\n",
" subquery.Total_no_of_credit_transactions,\n",
" case\n",
" when subquery.Debit_transaction_amount is NULL then 0\n",
" else Debit_transaction_amount\n",
" end as Debit_transaction_amount,\n",
" case\n",
" when subquery.Total_no_of_debit_transactions is NULL then 0\n",
" else Total_no_of_debit_transactions\n",
" end as Total_no_of_debit_transactions,\n",
" case\n",
" when subquery.Debit_transaction_amount = 0\n",
" or subquery.Debit_transaction_amount is NULL then 0\n",
" else subquery.Credit_transaction_amount / subquery.Debit_transaction_amount\n",
" end as Wash_Ratio\n",
" from \n",
" (\n",
" (\n",
" select customer_number as CUSTOMER_NUMBER_1, \n",
" sum(transaction_amount) as Credit_transaction_amount, \n",
" count(*) as Total_no_of_credit_transactions\n",
" from \n",
" (\n",
" select * \n",
" from {trans_data} as trans_table left join {acc_data} as acc_table\n",
" on trans_table.benef_account_number = acc_table.account_number\n",
" )\n",
" where account_number not in ('None')\n",
" group by 1\n",
" ) credit left join\n",
" (\n",
" select customer_number as CUSTOMER_NUMBER_2, \n",
" sum(transaction_amount) as Debit_transaction_amount, \n",
" count(*) as Total_no_of_debit_transactions\n",
" from \n",
" (\n",
" select * \n",
" from {trans_data} as trans_table left join {acc_data} as acc_table\n",
" on trans_table.orig_account_number = acc_table.account_number\n",
" )\n",
" where account_number not in ('None')\n",
" group by 1\n",
" ) debit on credit.CUSTOMER_NUMBER_1 = debit.CUSTOMER_NUMBER_2 \n",
" ) subquery\n",
" ) main left join \n",
" (\n",
" select subquery.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,\n",
" subquery.SEGMENT,\n",
" subquery.RISK,\n",
" case\n",
" when subquery.SAR_FLAG is NULL then 'N'\n",
" else subquery.SAR_FLAG\n",
" end as SAR_FLAG \n",
" from\n",
" (\n",
" (\n",
" select customer_number as CUSTOMER_NUMBER_3, \n",
" business_segment as SEGMENT,\n",
" case\n",
" when RISK_CLASSIFICATION = 1 then 'Low Risk'\n",
" when RISK_CLASSIFICATION = 2 then 'Medium Risk'\n",
" when RISK_CLASSIFICATION = 3 then 'High Risk'\n",
" else 'Unknown Risk'\n",
" end AS RISK\n",
" from {cust_data}\n",
" ) cd left join\n",
" (\n",
" select customer_number as CUSTOMER_NUMBER_4, \n",
" sar_flag as SAR_FLAG\n",
" from {alert_data}\n",
" ) ad on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4\n",
" ) subquery\n",
" ) cust_alert on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main\n",
" ) final\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "d220561a-34c9-48d2-8e2f-5d174a87540b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from tms_data_interface import SQLQueryInterface\n",
"\n",
"class Scenario:\n",
" seq = SQLQueryInterface(schema=\"transactionschema\")\n",
"\n",
" def logic(self, **kwargs):\n",
" row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n",
" cust_data=\"customer_data_v1\",\n",
" acc_data=\"account_data_v1\",\n",
" alert_data=\"alert_data_v1\")\n",
" )\n",
" cols = [\"Focal_id\", \"Credit_transaction_amount\",\n",
" \"Total_no_of_credit_transactions\",\n",
" \"Debit_transaction_amount\", \"Total_no_of_debit_transactions\",\n",
" \"Wash_Ratio\", \"Segment\", \"Risk\", \"SAR_FLAG\"]\n",
" df = pd.DataFrame(row_list, columns = cols)\n",
" df[[\"Credit_transaction_amount\",\n",
" \"Debit_transaction_amount\"]] = df[[\"Credit_transaction_amount\",\n",
" \"Debit_transaction_amount\"]].astype('int')\n",
" df[\"Wash_Ratio\"] = df[\"Wash_Ratio\"].astype('float')\n",
" # Step 1: Compute 90th percentiles per Segment for all 3 fields\n",
" percentiles = (\n",
" df.groupby(\"Segment\")[[\"Credit_transaction_amount\",\n",
" \"Debit_transaction_amount\",\n",
" \"Wash_Ratio\"]]\n",
" .quantile(0.98)\n",
" .reset_index()\n",
" )\n",
"\n",
" # Rename columns for clarity\n",
" percentiles = percentiles.rename(columns={\n",
" \"Credit_transaction_amount\": \"P90_Credit\",\n",
" \"Debit_transaction_amount\": \"P90_Debit\",\n",
" \"Wash_Ratio\": \"P90_Wash\"\n",
" })\n",
"\n",
" # Step 2: Merge back to main df\n",
" df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n",
"\n",
" # Step 3: Identify customers above 90th percentile in ANY of the 3 metrics\n",
" high_pop = (\n",
" (df[\"Credit_transaction_amount\"] > df[\"P90_Credit\"]) |\n",
" (df[\"Debit_transaction_amount\"] > df[\"P90_Debit\"]) |\n",
" (df[\"Wash_Ratio\"] > df[\"P90_Wash\"])\n",
" )\n",
"\n",
" # Step 4: Randomly select 0.1% sample from high-risk population\n",
" sample_fraction = 0.1 # 0.1%\n",
" high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n",
"\n",
" # Step 5: Set SAR_FLAG values\n",
" df[\"SAR_FLAG\"] = \"N\" # default for all\n",
" df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to 0.1% random high-risk population\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "2e5a0ea9-64cd-4a8d-9a5d-e5e7b36a401a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"sen = Scenario()\n",
"a = sen.logic()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "830c7ec3-9707-46db-9b27-ac4f9d46a03a",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Focal_id</th>\n",
" <th>Credit_transaction_amount</th>\n",
" <th>Total_no_of_credit_transactions</th>\n",
" <th>Debit_transaction_amount</th>\n",
" <th>Total_no_of_debit_transactions</th>\n",
" <th>Wash_Ratio</th>\n",
" <th>Segment</th>\n",
" <th>Risk</th>\n",
" <th>SAR_FLAG</th>\n",
" <th>P90_Credit</th>\n",
" <th>P90_Debit</th>\n",
" <th>P90_Wash</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>PN478710</td>\n",
" <td>2805352312</td>\n",
" <td>2020</td>\n",
" <td>2787126309</td>\n",
" <td>2025</td>\n",
" <td>1.006539</td>\n",
" <td>Corporate Banking</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>6.274828e+09</td>\n",
" <td>6.259298e+09</td>\n",
" <td>1.090121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>PN483125</td>\n",
" <td>3890052135</td>\n",
" <td>2797</td>\n",
" <td>3968882113</td>\n",
" <td>2850</td>\n",
" <td>0.980138</td>\n",
" <td>Govt. Entities</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>6.112897e+09</td>\n",
" <td>6.072409e+09</td>\n",
" <td>1.112059</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>PN890403</td>\n",
" <td>4136296083</td>\n",
" <td>2937</td>\n",
" <td>3999785063</td>\n",
" <td>2824</td>\n",
" <td>1.034130</td>\n",
" <td>SME</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>5.709904e+09</td>\n",
" <td>5.559419e+09</td>\n",
" <td>1.118816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PN531475</td>\n",
" <td>4183673982</td>\n",
" <td>2861</td>\n",
" <td>3987068168</td>\n",
" <td>2770</td>\n",
" <td>1.049311</td>\n",
" <td>Corporate Banking</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>6.274828e+09</td>\n",
" <td>6.259298e+09</td>\n",
" <td>1.090121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>PN147722</td>\n",
" <td>1775594615</td>\n",
" <td>1225</td>\n",
" <td>1641559222</td>\n",
" <td>1221</td>\n",
" <td>1.081651</td>\n",
" <td>SME</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>5.709904e+09</td>\n",
" <td>5.559419e+09</td>\n",
" <td>1.118816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10009</th>\n",
" <td>PN955059</td>\n",
" <td>31106290</td>\n",
" <td>264</td>\n",
" <td>25266130</td>\n",
" <td>369</td>\n",
" <td>1.231146</td>\n",
" <td>Priority Banking</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>7.616620e+07</td>\n",
" <td>5.263062e+07</td>\n",
" <td>1.921224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10010</th>\n",
" <td>PN602067</td>\n",
" <td>29780658</td>\n",
" <td>238</td>\n",
" <td>27796448</td>\n",
" <td>405</td>\n",
" <td>1.071384</td>\n",
" <td>Others</td>\n",
" <td>High Risk</td>\n",
" <td>N</td>\n",
" <td>7.897534e+07</td>\n",
" <td>5.488447e+07</td>\n",
" <td>1.931817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10011</th>\n",
" <td>PN213487</td>\n",
" <td>41410071</td>\n",
" <td>274</td>\n",
" <td>23896844</td>\n",
" <td>368</td>\n",
" <td>1.732868</td>\n",
" <td>Others</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>7.897534e+07</td>\n",
" <td>5.488447e+07</td>\n",
" <td>1.931817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10012</th>\n",
" <td>PN563065</td>\n",
" <td>34009021</td>\n",
" <td>251</td>\n",
" <td>32563582</td>\n",
" <td>375</td>\n",
" <td>1.044388</td>\n",
" <td>Others</td>\n",
" <td>Low Risk</td>\n",
" <td>N</td>\n",
" <td>7.897534e+07</td>\n",
" <td>5.488447e+07</td>\n",
" <td>1.931817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10013</th>\n",
" <td>PN388875</td>\n",
" <td>30904340</td>\n",
" <td>236</td>\n",
" <td>21938266</td>\n",
" <td>344</td>\n",
" <td>1.408696</td>\n",
" <td>Mass Market</td>\n",
" <td>Medium Risk</td>\n",
" <td>N</td>\n",
" <td>7.921967e+07</td>\n",
" <td>5.290545e+07</td>\n",
" <td>1.915159</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10014 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" Focal_id Credit_transaction_amount Total_no_of_credit_transactions \\\n",
"0 PN478710 2805352312 2020 \n",
"1 PN483125 3890052135 2797 \n",
"2 PN890403 4136296083 2937 \n",
"3 PN531475 4183673982 2861 \n",
"4 PN147722 1775594615 1225 \n",
"... ... ... ... \n",
"10009 PN955059 31106290 264 \n",
"10010 PN602067 29780658 238 \n",
"10011 PN213487 41410071 274 \n",
"10012 PN563065 34009021 251 \n",
"10013 PN388875 30904340 236 \n",
"\n",
" Debit_transaction_amount Total_no_of_debit_transactions Wash_Ratio \\\n",
"0 2787126309 2025 1.006539 \n",
"1 3968882113 2850 0.980138 \n",
"2 3999785063 2824 1.034130 \n",
"3 3987068168 2770 1.049311 \n",
"4 1641559222 1221 1.081651 \n",
"... ... ... ... \n",
"10009 25266130 369 1.231146 \n",
"10010 27796448 405 1.071384 \n",
"10011 23896844 368 1.732868 \n",
"10012 32563582 375 1.044388 \n",
"10013 21938266 344 1.408696 \n",
"\n",
" Segment Risk SAR_FLAG P90_Credit P90_Debit \\\n",
"0 Corporate Banking Low Risk N 6.274828e+09 6.259298e+09 \n",
"1 Govt. Entities Low Risk N 6.112897e+09 6.072409e+09 \n",
"2 SME Low Risk N 5.709904e+09 5.559419e+09 \n",
"3 Corporate Banking Low Risk N 6.274828e+09 6.259298e+09 \n",
"4 SME Low Risk N 5.709904e+09 5.559419e+09 \n",
"... ... ... ... ... ... \n",
"10009 Priority Banking Low Risk N 7.616620e+07 5.263062e+07 \n",
"10010 Others High Risk N 7.897534e+07 5.488447e+07 \n",
"10011 Others Low Risk N 7.897534e+07 5.488447e+07 \n",
"10012 Others Low Risk N 7.897534e+07 5.488447e+07 \n",
"10013 Mass Market Medium Risk N 7.921967e+07 5.290545e+07 \n",
"\n",
" P90_Wash \n",
"0 1.090121 \n",
"1 1.112059 \n",
"2 1.118816 \n",
"3 1.090121 \n",
"4 1.118816 \n",
"... ... \n",
"10009 1.921224 \n",
"10010 1.931817 \n",
"10011 1.931817 \n",
"10012 1.931817 \n",
"10013 1.915159 \n",
"\n",
"[10014 rows x 12 columns]"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "150bb5ce-6be1-44fc-a606-6d375354626d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# a[a[\"SAR_FLAG\"] == \"Y\"]\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}