commit 35c02db444d85e747dd92b59ed7a98218a8d92b1 Author: user_client2024 Date: Tue Jun 16 16:39:19 2026 +0000 Initial commit diff --git a/.ipynb_checkpoints/main-checkpoint.ipynb b/.ipynb_checkpoints/main-checkpoint.ipynb new file mode 100644 index 0000000..c5ecb92 --- /dev/null +++ b/.ipynb_checkpoints/main-checkpoint.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "query = \"\"\"\n", + " select final.CUSTOMER_NUMBER_main as Focal_id,\n", + " CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,\n", + " final.Cash_deposit_count,\n", + " final.SEGMENT,\n", + " final.RISK,\n", + " final.SAR_FLAG\n", + "from \n", + "(\n", + " (\n", + " select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,\n", + " subquery.Cash_deposit_total,\n", + " subquery.Cash_deposit_count\n", + " from \n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_1, \n", + " sum(transaction_amount) as Cash_deposit_total, \n", + " count(*) as Cash_deposit_count\n", + " from \n", + " (\n", + " select * \n", + " from {trans_data} trans_table \n", + " left join {acc_data} acc_table\n", + " on trans_table.benef_account_number = acc_table.account_number\n", + " ) trans\n", + " where account_number not in ('None')\n", + " and transaction_desc = 'CASH RELATED TRANSACTION'\n", + " group by customer_number\n", + " ) subquery\n", + " ) main \n", + " left join \n", + " (\n", + " select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,\n", + " cd.SEGMENT,\n", + " cd.RISK,\n", + " case\n", + " when ad.SAR_FLAG is NULL then 'N'\n", + " else ad.SAR_FLAG\n", + " end as SAR_FLAG \n", + " from\n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_3, \n", + " business_segment as SEGMENT,\n", + " case\n", + " when RISK_CLASSIFICATION = 1 then 'Low Risk'\n", + " when RISK_CLASSIFICATION = 2 then 'Medium 
Risk'\n", + " when RISK_CLASSIFICATION = 3 then 'High Risk'\n", + " else 'Unknown Risk'\n", + " end AS RISK\n", + " from {cust_data}\n", + " ) cd \n", + " left join\n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_4, \n", + " sar_flag as SAR_FLAG\n", + " from {alert_data}\n", + " ) ad \n", + " on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4\n", + " ) as cust_alert\n", + " on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main\n", + ") as final\n", + "\"\"\"\n", + "\n", + "from tms_data_interface import SQLQueryInterface\n", + "\n", + "class Scenario:\n", + " seq = SQLQueryInterface(schema=\"transactionschema\")\n", + "\n", + " def logic(self, **kwargs):\n", + " row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n", + " cust_data=\"customer_data_v1\",\n", + " acc_data=\"account_data_v1\",\n", + " alert_data=\"alert_data_v1\")\n", + " )\n", + " cols = [\"Focal_id\", \"Cash_deposit_total\", \"Cash_deposit_count\",\n", + " \"Segment\", \"Risk\", \"SAR_FLAG\"]\n", + " df = pd.DataFrame(row_list, columns = cols)\n", + " df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n", + " \n", + " \n", + "\n", + " # Step 1: Compute 98th percentiles per Segment for both cash-deposit fields\n", + " percentiles = (\n", + " df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n", + " \"Cash_deposit_count\"]]\n", + " .quantile(0.98)\n", + " .reset_index()\n", + " )\n", + "\n", + " # Rename columns (names say P90, but the values are 98th percentiles)\n", + " percentiles = percentiles.rename(columns={\n", + " \"Cash_deposit_total\": \"P90_Credit\",\n", + " \"Cash_deposit_count\": \"P90_Credit_count\"\n", + " })\n", + "\n", + " # Step 2: Merge back to main df\n", + " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n", + "\n", + " # Step 3: Identify customers above the 98th percentile in BOTH metrics\n", + " high_pop = (\n", + " (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n", + " (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n", + " )\n", + "\n", + " # 
Step 4: Randomly select a sample_fraction share of the high-risk population\n", + " sample_fraction = 0.1 # 10% (0.1% would be 0.001) - confirm intended rate\n", + " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n", + "\n", + " # Step 5: Set SAR_FLAG values\n", + " df[\"SAR_FLAG\"] = \"N\" # default for all (overwrites the SAR_FLAG returned by the query)\n", + " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to the sampled high-risk rows\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1f20337b-8116-47e5-8743-1ba41e2df819", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# sen = Scenario()\n", + "# a = sen.logic()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6de62b37-00d1-4c88-b27b-9a70e05add91", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# a[a[\"SAR_FLAG\"] == \"Y\"]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..c5ecb92 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "query = \"\"\"\n", + " select final.CUSTOMER_NUMBER_main as Focal_id,\n", + " CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,\n", + " final.Cash_deposit_count,\n", + " final.SEGMENT,\n", + " final.RISK,\n", + " final.SAR_FLAG\n", + "from \n", + "(\n", + " (\n", + " select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,\n", + " 
subquery.Cash_deposit_total,\n", + " subquery.Cash_deposit_count\n", + " from \n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_1, \n", + " sum(transaction_amount) as Cash_deposit_total, \n", + " count(*) as Cash_deposit_count\n", + " from \n", + " (\n", + " select * \n", + " from {trans_data} trans_table \n", + " left join {acc_data} acc_table\n", + " on trans_table.benef_account_number = acc_table.account_number\n", + " ) trans\n", + " where account_number not in ('None')\n", + " and transaction_desc = 'CASH RELATED TRANSACTION'\n", + " group by customer_number\n", + " ) subquery\n", + " ) main \n", + " left join \n", + " (\n", + " select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,\n", + " cd.SEGMENT,\n", + " cd.RISK,\n", + " case\n", + " when ad.SAR_FLAG is NULL then 'N'\n", + " else ad.SAR_FLAG\n", + " end as SAR_FLAG \n", + " from\n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_3, \n", + " business_segment as SEGMENT,\n", + " case\n", + " when RISK_CLASSIFICATION = 1 then 'Low Risk'\n", + " when RISK_CLASSIFICATION = 2 then 'Medium Risk'\n", + " when RISK_CLASSIFICATION = 3 then 'High Risk'\n", + " else 'Unknown Risk'\n", + " end AS RISK\n", + " from {cust_data}\n", + " ) cd \n", + " left join\n", + " (\n", + " select customer_number as CUSTOMER_NUMBER_4, \n", + " sar_flag as SAR_FLAG\n", + " from {alert_data}\n", + " ) ad \n", + " on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4\n", + " ) as cust_alert\n", + " on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main\n", + ") as final\n", + "\"\"\"\n", + "\n", + "from tms_data_interface import SQLQueryInterface\n", + "\n", + "class Scenario:\n", + " seq = SQLQueryInterface(schema=\"transactionschema\")\n", + "\n", + " def logic(self, **kwargs):\n", + " row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n", + " cust_data=\"customer_data_v1\",\n", + " acc_data=\"account_data_v1\",\n", + " alert_data=\"alert_data_v1\")\n", + " )\n", + " cols = [\"Focal_id\", 
\"Cash_deposit_total\", \"Cash_deposit_count\",\n", + " \"Segment\", \"Risk\", \"SAR_FLAG\"]\n", + " df = pd.DataFrame(row_list, columns = cols)\n", + " df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n", + " \n", + " \n", + "\n", + " # Step 1: Compute 98th percentiles per Segment for both cash-deposit fields\n", + " percentiles = (\n", + " df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n", + " \"Cash_deposit_count\"]]\n", + " .quantile(0.98)\n", + " .reset_index()\n", + " )\n", + "\n", + " # Rename columns (names say P90, but the values are 98th percentiles)\n", + " percentiles = percentiles.rename(columns={\n", + " \"Cash_deposit_total\": \"P90_Credit\",\n", + " \"Cash_deposit_count\": \"P90_Credit_count\"\n", + " })\n", + "\n", + " # Step 2: Merge back to main df\n", + " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n", + "\n", + " # Step 3: Identify customers above the 98th percentile in BOTH metrics\n", + " high_pop = (\n", + " (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n", + " (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n", + " )\n", + "\n", + " # Step 4: Randomly select a sample_fraction share of the high-risk population\n", + " sample_fraction = 0.1 # 10% (0.1% would be 0.001) - confirm intended rate\n", + " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n", + "\n", + " # Step 5: Set SAR_FLAG values\n", + " df[\"SAR_FLAG\"] = \"N\" # default for all (overwrites the SAR_FLAG returned by the query)\n", + " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to the sampled high-risk rows\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1f20337b-8116-47e5-8743-1ba41e2df819", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# sen = Scenario()\n", + "# a = sen.logic()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6de62b37-00d1-4c88-b27b-9a70e05add91", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# a[a[\"SAR_FLAG\"] == \"Y\"]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..7fb7723 --- /dev/null +++ b/main.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[7]: + + +import pandas as pd + +query = """ + select final.CUSTOMER_NUMBER_main as Focal_id, + CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total, + final.Cash_deposit_count, + final.SEGMENT, + final.RISK, + final.SAR_FLAG +from +( + ( + select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main, + subquery.Cash_deposit_total, + subquery.Cash_deposit_count + from + ( + select customer_number as CUSTOMER_NUMBER_1, + sum(transaction_amount) as Cash_deposit_total, + count(*) as Cash_deposit_count + from + ( + select * + from {trans_data} trans_table + left join {acc_data} acc_table + on trans_table.benef_account_number = acc_table.account_number + ) trans + where account_number not in ('None') + and transaction_desc = 'CASH RELATED TRANSACTION' + group by customer_number + ) subquery + ) main + left join + ( + select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust, + cd.SEGMENT, + cd.RISK, + case + when ad.SAR_FLAG is NULL then 'N' + else ad.SAR_FLAG + end as SAR_FLAG + from + ( + select customer_number as CUSTOMER_NUMBER_3, + business_segment as SEGMENT, + case + when RISK_CLASSIFICATION = 1 then 'Low Risk' + when RISK_CLASSIFICATION = 2 then 'Medium Risk' + when RISK_CLASSIFICATION = 3 then 'High Risk' + else 'Unknown Risk' + end AS RISK + from {cust_data} + ) cd + left join + ( + select customer_number as CUSTOMER_NUMBER_4, + sar_flag as SAR_FLAG + from {alert_data} + ) ad + on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4 + ) 
as cust_alert + on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main +) as final +""" + +from tms_data_interface import SQLQueryInterface + +class Scenario: + seq = SQLQueryInterface(schema="transactionschema") + + def logic(self, **kwargs): + row_list = self.seq.execute_raw(query.format(trans_data="transaction10m", + cust_data="customer_data_v1", + acc_data="account_data_v1", + alert_data="alert_data_v1") + ) + cols = ["Focal_id", "Cash_deposit_total", "Cash_deposit_count", + "Segment", "Risk", "SAR_FLAG"] + df = pd.DataFrame(row_list, columns = cols) + df["Cash_deposit_total"] = df["Cash_deposit_total"].astype(float) + + + + # Step 1: Compute 98th percentiles per Segment for both cash-deposit fields + percentiles = ( + df.groupby("Segment")[["Cash_deposit_total", + "Cash_deposit_count"]] + .quantile(0.98) + .reset_index() + ) + + # Rename columns (names say P90, but the values are 98th percentiles) + percentiles = percentiles.rename(columns={ + "Cash_deposit_total": "P90_Credit", + "Cash_deposit_count": "P90_Credit_count" + }) + + # Step 2: Merge back to main df + df = df.merge(percentiles, on="Segment", how="left") + + # Step 3: Identify customers above the 98th percentile in BOTH metrics + high_pop = ( + (df["Cash_deposit_total"] > df["P90_Credit"]) & + (df["Cash_deposit_count"] > df["P90_Credit_count"]) + ) + + # Step 4: Randomly select a sample_fraction share of the high-risk population + sample_fraction = 0.1 # 10% (0.1% would be 0.001) - confirm intended rate + high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index + + # Step 5: Set SAR_FLAG values + df["SAR_FLAG"] = "N" # default for all (overwrites the SAR_FLAG returned by the query) + df.loc[high_pop_indices, "SAR_FLAG"] = "Y" # assign Y to the sampled high-risk rows + + return df + + +# In[8]: + + +# sen = Scenario() +# a = sen.logic() + + +# In[10]: + + +# a[a["SAR_FLAG"] == "Y"] +