generated from user_client2024/152
System save at 22/09/2025 17:03 by user_client2024
This commit is contained in:
parent
8a8c00ab77
commit
39dc2b32ef
@ -11,6 +11,49 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from tms_data_interface import SQLQueryInterface\n",
|
"from tms_data_interface import SQLQueryInterface\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"def apply_sar_flag(df, var1, var2, var3, random_state=42):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Apply percentile-based thresholds, split data into alerting and non-alerting,\n",
|
||||||
|
" flag random 10% of alerting data as 'Y', and merge back.\n",
|
||||||
|
"\n",
|
||||||
|
" Parameters:\n",
|
||||||
|
" df (pd.DataFrame): Input dataframe\n",
|
||||||
|
" var1 (str): First variable (for 50th percentile threshold)\n",
|
||||||
|
" var2 (str): Second variable (for 50th percentile threshold)\n",
|
||||||
|
" var3 (str): Third variable (for 90th percentile threshold)\n",
|
||||||
|
" random_state (int): Seed for reproducibility\n",
|
||||||
|
"\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" pd.DataFrame: DataFrame with 'SAR_Flag' column added\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Calculate thresholds\n",
|
||||||
|
" th1 = np.percentile(df[var1].dropna(), 50)\n",
|
||||||
|
" th2 = np.percentile(df[var2].dropna(), 50)\n",
|
||||||
|
" th3 = np.percentile(df[var3].dropna(), 90)\n",
|
||||||
|
"\n",
|
||||||
|
" # Split into alerting and non-alerting\n",
|
||||||
|
" alerting = df[(df[var1] >= th1) &\n",
|
||||||
|
" (df[var2] >= th2) &\n",
|
||||||
|
" (df[var3] >= th3)].copy()\n",
|
||||||
|
"\n",
|
||||||
|
" non_alerting = df.loc[~df.index.isin(alerting.index)].copy()\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign SAR_Flag = 'N' for non-alerting\n",
|
||||||
|
" non_alerting['SAR_Flag'] = 'N'\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign SAR_Flag for alerting data\n",
|
||||||
|
" alerting['SAR_Flag'] = 'N'\n",
|
||||||
|
" n_y = int(len(alerting) * 0.1) # 10% count\n",
|
||||||
|
" if n_y > 0:\n",
|
||||||
|
" y_indices = alerting.sample(n=n_y, random_state=random_state).index\n",
|
||||||
|
" alerting.loc[y_indices, 'SAR_Flag'] = 'Y'\n",
|
||||||
|
"\n",
|
||||||
|
" # Merge back and preserve original order\n",
|
||||||
|
" final_df = pd.concat([alerting, non_alerting]).sort_index()\n",
|
||||||
|
"\n",
|
||||||
|
" return final_df\n",
|
||||||
|
"\n",
|
||||||
"query = \"\"\"\n",
|
"query = \"\"\"\n",
|
||||||
"WITH time_windows AS (\n",
|
"WITH time_windows AS (\n",
|
||||||
" SELECT\n",
|
" SELECT\n",
|
||||||
@ -107,9 +150,14 @@
|
|||||||
" final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
|
" final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
|
||||||
" final_scenario_df['TOTAL_VOLUME'] * 100\n",
|
" final_scenario_df['TOTAL_VOLUME'] * 100\n",
|
||||||
" final_scenario_df['Segment'] = 'Default'\n",
|
" final_scenario_df['Segment'] = 'Default'\n",
|
||||||
" final_scenario_df['SAR_FLAG'] = 'N'\n",
|
" # final_scenario_df['SAR_FLAG'] = 'N'\n",
|
||||||
" final_scenario_df['Risk'] = 'Medium Risk'\n",
|
" final_scenario_df['Risk'] = 'Medium Risk'\n",
|
||||||
" final_scenario_df.dropna(inplace=True)\n",
|
" final_scenario_df.dropna(inplace=True)\n",
|
||||||
|
" final_scenario_df = apply_sar_flag(final_scenario_df,\n",
|
||||||
|
" 'PRICE_CHANGE_PCT',\n",
|
||||||
|
" 'PARTICIPANT_VOLUME_PCT',\n",
|
||||||
|
" 'TOTAL_VOLUME',\n",
|
||||||
|
" random_state=42)\n",
|
||||||
" # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
|
" # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
|
||||||
" return final_scenario_df\n"
|
" return final_scenario_df\n"
|
||||||
]
|
]
|
||||||
|
|||||||
50
main.ipynb
50
main.ipynb
@ -11,6 +11,49 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"from tms_data_interface import SQLQueryInterface\n",
|
"from tms_data_interface import SQLQueryInterface\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"def apply_sar_flag(df, var1, var2, var3, random_state=42):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Apply percentile-based thresholds, split data into alerting and non-alerting,\n",
|
||||||
|
" flag random 10% of alerting data as 'Y', and merge back.\n",
|
||||||
|
"\n",
|
||||||
|
" Parameters:\n",
|
||||||
|
" df (pd.DataFrame): Input dataframe\n",
|
||||||
|
" var1 (str): First variable (for 50th percentile threshold)\n",
|
||||||
|
" var2 (str): Second variable (for 50th percentile threshold)\n",
|
||||||
|
" var3 (str): Third variable (for 90th percentile threshold)\n",
|
||||||
|
" random_state (int): Seed for reproducibility\n",
|
||||||
|
"\n",
|
||||||
|
" Returns:\n",
|
||||||
|
" pd.DataFrame: DataFrame with 'SAR_Flag' column added\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Calculate thresholds\n",
|
||||||
|
" th1 = np.percentile(df[var1].dropna(), 50)\n",
|
||||||
|
" th2 = np.percentile(df[var2].dropna(), 50)\n",
|
||||||
|
" th3 = np.percentile(df[var3].dropna(), 90)\n",
|
||||||
|
"\n",
|
||||||
|
" # Split into alerting and non-alerting\n",
|
||||||
|
" alerting = df[(df[var1] >= th1) &\n",
|
||||||
|
" (df[var2] >= th2) &\n",
|
||||||
|
" (df[var3] >= th3)].copy()\n",
|
||||||
|
"\n",
|
||||||
|
" non_alerting = df.loc[~df.index.isin(alerting.index)].copy()\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign SAR_Flag = 'N' for non-alerting\n",
|
||||||
|
" non_alerting['SAR_Flag'] = 'N'\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign SAR_Flag for alerting data\n",
|
||||||
|
" alerting['SAR_Flag'] = 'N'\n",
|
||||||
|
" n_y = int(len(alerting) * 0.1) # 10% count\n",
|
||||||
|
" if n_y > 0:\n",
|
||||||
|
" y_indices = alerting.sample(n=n_y, random_state=random_state).index\n",
|
||||||
|
" alerting.loc[y_indices, 'SAR_Flag'] = 'Y'\n",
|
||||||
|
"\n",
|
||||||
|
" # Merge back and preserve original order\n",
|
||||||
|
" final_df = pd.concat([alerting, non_alerting]).sort_index()\n",
|
||||||
|
"\n",
|
||||||
|
" return final_df\n",
|
||||||
|
"\n",
|
||||||
"query = \"\"\"\n",
|
"query = \"\"\"\n",
|
||||||
"WITH time_windows AS (\n",
|
"WITH time_windows AS (\n",
|
||||||
" SELECT\n",
|
" SELECT\n",
|
||||||
@ -107,9 +150,14 @@
|
|||||||
" final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
|
" final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n",
|
||||||
" final_scenario_df['TOTAL_VOLUME'] * 100\n",
|
" final_scenario_df['TOTAL_VOLUME'] * 100\n",
|
||||||
" final_scenario_df['Segment'] = 'Default'\n",
|
" final_scenario_df['Segment'] = 'Default'\n",
|
||||||
" final_scenario_df['SAR_FLAG'] = 'N'\n",
|
" # final_scenario_df['SAR_FLAG'] = 'N'\n",
|
||||||
" final_scenario_df['Risk'] = 'Medium Risk'\n",
|
" final_scenario_df['Risk'] = 'Medium Risk'\n",
|
||||||
" final_scenario_df.dropna(inplace=True)\n",
|
" final_scenario_df.dropna(inplace=True)\n",
|
||||||
|
" final_scenario_df = apply_sar_flag(final_scenario_df,\n",
|
||||||
|
" 'PRICE_CHANGE_PCT',\n",
|
||||||
|
" 'PARTICIPANT_VOLUME_PCT',\n",
|
||||||
|
" 'TOTAL_VOLUME',\n",
|
||||||
|
" random_state=42)\n",
|
||||||
" # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
|
" # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n",
|
||||||
" return final_scenario_df\n"
|
" return final_scenario_df\n"
|
||||||
]
|
]
|
||||||
|
|||||||
50
main.py
50
main.py
@ -8,6 +8,49 @@ from datetime import datetime, timedelta
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from tms_data_interface import SQLQueryInterface
|
from tms_data_interface import SQLQueryInterface
|
||||||
|
|
||||||
|
def apply_sar_flag(df, var1, var2, var3, random_state=42, *,
                   flag_column='SAR_Flag', sample_frac=0.1):
    """
    Flag a reproducible random sample of the "alerting" rows of a dataframe.

    A row is alerting when all three variables are at or above their
    percentile thresholds (50th for var1 and var2, 90th for var3). Every row
    gets 'N' in the flag column, then ``sample_frac`` of the alerting rows
    (rounded down) are switched to 'Y'.

    Parameters:
    df (pd.DataFrame): Input dataframe (not modified; a copy is returned)
    var1 (str): First variable (for 50th percentile threshold)
    var2 (str): Second variable (for 50th percentile threshold)
    var3 (str): Third variable (for 90th percentile threshold)
    random_state (int): Seed for reproducibility
    flag_column (str): Name of the flag column to add (default 'SAR_Flag')
    sample_frac (float): Share of alerting rows to flag 'Y', within [0, 1]

    Returns:
    pd.DataFrame: Copy of ``df`` in its original row order with the flag
    column added

    Raises:
    ValueError: If ``sample_frac`` is outside [0, 1]
    """
    if not 0 <= sample_frac <= 1:
        raise ValueError("sample_frac must be between 0 and 1")

    flagged = df.copy()

    # Nothing to threshold or sample on an empty frame; just add the column.
    if flagged.empty:
        flagged[flag_column] = 'N'
        return flagged

    # Series.quantile skips NaN and interpolates linearly, so the thresholds
    # are computed with pandas alone.
    th1 = df[var1].quantile(0.50)
    th2 = df[var2].quantile(0.50)
    th3 = df[var3].quantile(0.90)

    # Work on plain boolean arrays so the result does not depend on index
    # labels (which may be non-unique or unsorted). NaN compares as False.
    alert_mask = (
        (df[var1] >= th1).to_numpy()
        & (df[var2] >= th2).to_numpy()
        & (df[var3] >= th3).to_numpy()
    )

    # Row positions (not labels) of the alerting rows.
    alert_positions = pd.Series(range(len(df)))[alert_mask]

    flags = pd.Series('N', index=range(len(df)), dtype=object)
    n_y = int(len(alert_positions) * sample_frac)  # rounded down
    if n_y > 0:
        chosen = alert_positions.sample(n=n_y, random_state=random_state)
        flags.iloc[chosen.to_numpy()] = 'Y'

    # Positional assignment keeps the caller's row order and index untouched.
    flagged[flag_column] = flags.to_numpy()
    return flagged
|
||||||
|
|
||||||
query = """
|
query = """
|
||||||
WITH time_windows AS (
|
WITH time_windows AS (
|
||||||
SELECT
|
SELECT
|
||||||
@ -104,9 +147,14 @@ class Scenario:
|
|||||||
final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\
|
final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\
|
||||||
final_scenario_df['TOTAL_VOLUME'] * 100
|
final_scenario_df['TOTAL_VOLUME'] * 100
|
||||||
final_scenario_df['Segment'] = 'Default'
|
final_scenario_df['Segment'] = 'Default'
|
||||||
final_scenario_df['SAR_FLAG'] = 'N'
|
# final_scenario_df['SAR_FLAG'] = 'N'
|
||||||
final_scenario_df['Risk'] = 'Medium Risk'
|
final_scenario_df['Risk'] = 'Medium Risk'
|
||||||
final_scenario_df.dropna(inplace=True)
|
final_scenario_df.dropna(inplace=True)
|
||||||
|
final_scenario_df = apply_sar_flag(final_scenario_df,
|
||||||
|
'PRICE_CHANGE_PCT',
|
||||||
|
'PARTICIPANT_VOLUME_PCT',
|
||||||
|
'TOTAL_VOLUME',
|
||||||
|
random_state=42)
|
||||||
# final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']
|
# final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']
|
||||||
return final_scenario_df
|
return final_scenario_df
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user