import logging
import sys
import warnings

import numpy as np
import pandas as pd
from IPython.display import Image, Markdown, display
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from windows_malware_classifier.preprocessing.data_preparation_tools import (
    load_parquet_data,
    optimize_memory_usage,
    impute_numeric_neural_network,
)
from windows_malware_classifier.preprocessing.feature_engineering_tools import (
    generate_polynomial_features,
    consolidate_entropy_features,
    create_missing_value_pattern_features,
    create_resource_metrics,
    create_section_relationship_features,
    create_timestamp_features,
    create_binary_indicators,
    create_string_metrics,
    evaluate_auto_engineered_features,
    evaluate_combined_features,
    create_feature_interactions,
    remove_correlated_features,
    validate_feature_engineering,
)
from windows_malware_classifier.visualization.distributions_plots import (
    plot_pca_comparison,
)

%load_ext autoreload
%autoreload 2

# Seed shared by every randomised step in this notebook.
RANDOM_STATE = 42
# Silence all library warnings for cleaner notebook output.
warnings.filterwarnings("ignore")

# Notebook-level logger that writes timestamped records to stdout.
logger = logging.getLogger(__name__)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack one more handler each time and duplicate every
# log line. Attach the handler only when none is installed yet.
if not logger.handlers:
    logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Load the train and test frames. The loader is called without arguments, so
# the parquet locations are resolved inside load_parquet_data itself.
train_df, test_df = load_parquet_data()

2025-05-18 17:23:22,736 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Attempting to load parquet data from: /Users/vytautasbunevicius/windows-malware-classifier/data/processed
2025-05-18 17:23:22,736 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Loading training data from: /Users/vytautasbunevicius/windows-malware-classifier/data/processed/train_df.parquet
2025-05-18 17:23:22,898 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Successfully loaded training data. Shape: (18952, 196)
2025-05-18 17:23:22,898 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Loading test data from: /Users/vytautasbunevicius/windows-malware-classifier/data/processed/test_df.parquet
2025-05-18 17:23:22,912 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Successfully loaded test data. Shape: (4716, 196)

# Reduce the memory footprint of both frames. The helper returns the two
# frames plus a `stats` object; `stats` is not referenced again in this
# notebook.
# NOTE(review): categorical_threshold=0.5 is presumably a cardinality ratio
# used to decide on categorical conversion - confirm against the helper.
train_df, test_df, stats = optimize_memory_usage(
    train_df=train_df, test_df=test_df, categorical_threshold=0.5, verbose=True
)

2025-05-18 17:23:22,963 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Initial memory usage - Train: 29.70MB, Test: 7.45MB
2025-05-18 17:23:23,140 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Optimization complete - Train: 13.99MB reduced (47.1%), Test: 3.48MB reduced (46.7%) | Conversions - Categorical: 0, Numeric: 174, Boolean: 10

# Keep an untouched copy of the training frame; later cells compare the
# engineered features against this baseline.
original_train_df = train_df.copy()

# Add the binary indicator columns to train first, then test.
train_df = create_binary_indicators(train_df)
test_df = create_binary_indicators(test_df)

shape_summary = f"**Dataset shape after categorical engineering - Train: {train_df.shape}, Test: {test_df.shape}**"
display(Markdown(shape_summary))

2025-05-18 17:23:23,243 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_IMAGE_FILE_EXECUTABLE_IMAGE
2025-05-18 17:23:23,244 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_IMAGE_FILE_RELOCS_STRIPPED
2025-05-18 17:23:23,245 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_32BIT_MACHINE
2025-05-18 17:23:23,245 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_LARGE_ADDRESS_AWARE
2025-05-18 17:23:23,246 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_IMAGE_FILE_EXECUTABLE_IMAGE
2025-05-18 17:23:23,247 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_IMAGE_FILE_RELOCS_STRIPPED
2025-05-18 17:23:23,247 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_32BIT_MACHINE
2025-05-18 17:23:23,248 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary feature has_LARGE_ADDRESS_AWARE

# Consolidate the entropy columns. The helper returns the updated frame plus a
# fitted object exposing `n_components_` and `explained_variance_ratio_`.
# NOTE(review): the test frame is consolidated by a separate call instead of
# reusing `train_pca`; confirm whether the helper can apply the train-fitted
# transform so both frames end up with the same columns.
train_df, train_pca = consolidate_entropy_features(train_df, random_state=RANDOM_STATE)
test_df, _ = consolidate_entropy_features(test_df, random_state=RANDOM_STATE)

retained_components = train_pca.n_components_
explained_share = train_pca.explained_variance_ratio_.sum()

for summary_line in (
    f"**Number of entropy components retained: {retained_components}**",
    f"**Explained variance ratio: {explained_share:.4f}**",
):
    display(Markdown(summary_line))

2025-05-18 17:23:23,336 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - PCA maintained correlation with target: 0.1722 -> 0.1584
2025-05-18 17:23:23,372 - windows_malware_classifier.preprocessing.feature_engineering_tools - WARNING - Warning: PCA reduced correlation from 0.1691 to 0.1516
2025-05-18 17:23:23,373 - windows_malware_classifier.preprocessing.feature_engineering_tools - WARNING - Keeping high-importance entropy features alongside PCA components

# Baseline: mean absolute correlation between every original entropy column
# and the target, measured on the untouched training snapshot.
original_entropy_cols = [
    name
    for name in original_train_df.columns
    if name != "is_malicious" and "entropy" in name
]
baseline_entropy = original_train_df[original_entropy_cols]
original_corrs = (
    baseline_entropy.corrwith(original_train_df["is_malicious"]).abs().mean()
)

# Same statistic for the engineered composite columns, if any were produced.
entropy_composite_cols = [
    name for name in train_df.columns if "entropy_composite" in name
]

if not entropy_composite_cols:
    fallback_report = (
        f"**Original entropy correlation: {original_corrs:.4f}**\n\n"
        "No entropy composite features found - original high-importance features may have been retained"
    )
    display(Markdown(fallback_report))
else:
    composite_frame = train_df[entropy_composite_cols]
    new_corrs = composite_frame.corrwith(train_df["is_malicious"]).abs().mean()
    report_lines = [
        "**Entropy Feature Engineering Results:**",
        f"- Mean absolute correlation with target - Original: {original_corrs:.4f}, Engineered: {new_corrs:.4f}",
        f"- Number of entropy components: {len(entropy_composite_cols)}",
    ]
    display(Markdown("\n".join(report_lines)))

# Add the resource composite columns to both frames.
train_df = create_resource_metrics(train_df)
test_df = create_resource_metrics(test_df)

# Report the signed correlation with the target for whichever composite
# columns were actually created.
resource_cols = ["resource_complexity", "resource_risk"]
available_resource_cols = [
    name for name in resource_cols if name in train_df.columns
]

if not available_resource_cols:
    display(
        Markdown(
            "**No resource composite features were created - required source columns may be missing**"
        )
    )
else:
    resource_frame = train_df[available_resource_cols]
    resource_corrs = resource_frame.corrwith(train_df["is_malicious"])
    display(Markdown("**Resource composite correlations with target**"))
    display(resource_corrs.to_frame("correlation"))

# Add interaction features to both frames, train first and then test, each
# from its own rows.
train_df = create_feature_interactions(train_df)
test_df = create_feature_interactions(test_df)

2025-05-18 17:23:23,727 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary_content_composite for 18952 of 18952 rows
2025-05-18 17:23:23,740 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Created binary_content_composite for 4716 of 4716 rows

def _columns_containing(frame, fragments):
    """List the columns of ``frame`` whose name contains any of ``fragments``."""
    return [
        name for name in frame.columns if any(part in name for part in fragments)
    ]


def _ranked_target_correlations(frame, columns):
    """Return the absolute correlation of ``columns`` with ``is_malicious``, largest first."""
    return (
        frame[columns]
        .corrwith(frame["is_malicious"])
        .abs()
        .sort_values(ascending=False)
    )


def _show_correlations(heading, correlations):
    """Display a markdown heading followed by a one-column correlation table."""
    display(Markdown(heading))
    display(correlations.to_frame("correlation"))


# Apply the remaining feature-engineering steps, always train first, then test.
train_df = create_section_relationship_features(train_df)
test_df = create_section_relationship_features(test_df)

train_df = create_string_metrics(train_df)
test_df = create_string_metrics(test_df)

train_df = create_missing_value_pattern_features(train_df)
test_df = create_missing_value_pattern_features(test_df)

train_df = create_timestamp_features(train_df)
test_df = create_timestamp_features(test_df)

# Binary content composites: signed correlation, no ranking.
binary_cols = ["version_composite", "binary_content_composite"]
available_binary_cols = [name for name in binary_cols if name in train_df.columns]

if not available_binary_cols:
    display(
        Markdown(
            "**No binary content features were created - required source columns may be missing**"
        )
    )
else:
    binary_corrs = train_df[available_binary_cols].corrwith(train_df["is_malicious"])
    _show_correlations("**Binary feature correlations with target**", binary_corrs)

# Section relationship features: only the five strongest are shown.
section_rel_cols = _columns_containing(
    train_df, ["size_ratio", "entropy_anomaly", "size_discrepancy"]
)
if section_rel_cols:
    section_corrs = _ranked_target_correlations(train_df, section_rel_cols).head(5)
    _show_correlations(
        "**Top 5 section relationship features (correlation with target)**",
        section_corrs,
    )

# String analysis features.
string_cols = _columns_containing(
    train_df,
    [
        "suspicious_strings",
        "string_density",
        "network_registry_combo",
        "suspicious_net_strings",
    ],
)
if string_cols:
    string_corrs = _ranked_target_correlations(train_df, string_cols)
    _show_correlations(
        "**String analysis features (correlation with target)**", string_corrs
    )

# Timestamp features.
timestamp_cols = _columns_containing(
    train_df,
    [
        "timestamp_year",
        "timestamp_hour",
        "suspicious_timestamp",
        "timestamp_round",
    ],
)
if timestamp_cols:
    time_corrs = _ranked_target_correlations(train_df, timestamp_cols)
    _show_correlations("**Timestamp features (correlation with target)**", time_corrs)

# Missing-value pattern features.
missing_cols = _columns_containing(
    train_df,
    [
        "missing_indicators",
        "text_features_missing",
        "section_features_missing",
    ],
)
if missing_cols:
    missing_corrs = _ranked_target_correlations(train_df, missing_cols)
    _show_correlations(
        "**Missing value pattern features (correlation with target)**", missing_corrs
    )

2025-05-18 17:23:23,874 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added section relationship features
2025-05-18 17:23:23,879 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added section relationship features
2025-05-18 17:23:23,881 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added enhanced string analysis features
2025-05-18 17:23:23,883 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added enhanced string analysis features
2025-05-18 17:23:23,883 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added missing value pattern features
2025-05-18 17:23:23,884 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added missing value pattern features
2025-05-18 17:23:23,903 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added timestamp analysis features
2025-05-18 17:23:23,910 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Added timestamp analysis features

# Gather every column produced by the additional feature-engineering steps.
# The four lists come from an earlier cell; if that cell has not been run the
# fill step is skipped rather than raising a NameError.
enhanced_features = []
if all(
    var in locals()
    for var in [
        "section_rel_cols",
        "string_cols",
        "timestamp_cols",
        "missing_cols",
    ]
):
    enhanced_features = section_rel_cols + string_cols + timestamp_cols + missing_cols

if enhanced_features:
    # Zero-fill NaNs in train and test independently. Previously a test column
    # was filled only when the same train column contained NaNs, so NaNs that
    # occurred only in the test frame were silently left in place.
    filled_any = False
    for frame in (train_df, test_df):
        for col in enhanced_features:
            if col in frame.columns and frame[col].isna().any():
                frame[col] = frame[col].fillna(0)
                filled_any = True

    if not filled_any:
        display(Markdown("**No NaN values found in enhanced features.**"))

# Run the correlation-aware selection on the training frame only.
train_df = remove_correlated_features(train_df, correlation_threshold=0.95)

# Project the train-side selection onto the test frame instead of re-running
# the selection there. An independent run on test picks features using
# test-set target correlations (leakage) and can keep a different subset,
# leaving the two frames with mismatched columns.
# NOTE(review): assumes remove_correlated_features only drops columns; if it
# also transforms values, the helper needs a way to re-apply a fitted selection.
columns_kept_by_train = [col for col in train_df.columns if col in test_df.columns]
test_df = test_df[columns_kept_by_train].copy()

display(
    Markdown(f"**Features after correlation-aware selection: {train_df.shape[1] - 1}**")
)

2025-05-18 17:23:25,217 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Removed 'is_dll' due to perfect correlation with 'is_exe'
2025-05-18 17:23:25,220 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'timestamp_year' with highest correlation (0.1853) to target
2025-05-18 17:23:25,221 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'section_0_virt_size' with highest correlation (0.0543) to target
2025-05-18 17:23:25,223 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'image_base' with highest correlation (0.0131) to target
2025-05-18 17:23:25,225 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'section_0_size' with highest correlation (0.1558) to target
2025-05-18 17:23:25,226 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'section_4_virt_size' with highest correlation (0.0241) to target
2025-05-18 17:23:25,229 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'resource_types' with highest correlation (0.0669) to target
2025-05-18 17:23:25,230 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'has_32BIT_MACHINE' with highest correlation (0.7168) to target
2025-05-18 17:23:25,236 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Removed 8 redundant features
2025-05-18 17:23:25,628 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Removed 'is_dll' due to perfect correlation with 'is_exe'
2025-05-18 17:23:25,630 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'timestamp_year' with highest correlation (0.1829) to target
2025-05-18 17:23:25,632 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'base_of_code' with highest correlation (0.0511) to target
2025-05-18 17:23:25,633 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'section_0_size' with highest correlation (0.1724) to target
2025-05-18 17:23:25,634 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'has_resources' with highest correlation (0.0554) to target
2025-05-18 17:23:25,635 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Kept feature 'has_32BIT_MACHINE' with highest correlation (0.7247) to target
2025-05-18 17:23:25,639 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Removed 6 redundant features

# Compare the untouched training snapshot with the engineered frame and show
# the resulting metric/value pairs rounded to four decimals.
metrics = validate_feature_engineering(original_train_df, train_df)

display(Markdown("**Feature Engineering Validation Results:**"))

metric_pairs = metrics.items()
metrics_df = pd.DataFrame(metric_pairs, columns=["Metric", "Value"])
rounded_values = metrics_df["Value"].round(4)
metrics_df["Value"] = rounded_values

display(metrics_df)

def _numeric_predictors(frame):
    """Return the numeric columns of ``frame`` without the ``is_malicious`` target."""
    numeric_part = frame.select_dtypes(include=["number"])
    return numeric_part.drop("is_malicious", axis=1, errors="ignore")


# Predictor matrices before and after feature engineering; the target is taken
# from the untouched snapshot.
X_orig = _numeric_predictors(original_train_df)
y = original_train_df["is_malicious"]
X_eng = _numeric_predictors(train_df)

X_orig_df = pd.DataFrame(X_orig)
X_eng_df = pd.DataFrame(X_eng)

display(Markdown("**Data Preparation for Modeling**"))

feature_info = pd.DataFrame(
    {
        "Feature Set": ["Original Features", "Engineered Features"],
        "Shape before processing": [
            frame.shape for frame in (X_orig_df, X_eng_df)
        ],
        "NaN values": [
            frame.isna().sum().sum() for frame in (X_orig_df, X_eng_df)
        ],
    }
)
display(feature_info)

# Columns that are entirely NaN are removed before mean imputation.
X_orig_df = X_orig_df.dropna(axis=1, how="all")
X_eng_df = X_eng_df.dropna(axis=1, how="all")

# The same imputer object is refit for each feature set in turn.
imputer = SimpleImputer(strategy="mean")
X_orig_imputed = imputer.fit_transform(X_orig_df)
X_eng_imputed = imputer.fit_transform(X_eng_df)

processed_info = pd.DataFrame(
    {
        "Feature Set": ["Original Features", "Engineered Features"],
        "Shape after processing": [
            matrix.shape for matrix in (X_orig_imputed, X_eng_imputed)
        ],
    }
)
display(processed_info)

display(Markdown("**Model Evaluation Results**"))

model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)

results = {"Feature Set": [], "ROC-AUC": [], "Status": []}

# Fit and score the same estimator on each feature set.
# NOTE: the AUC is computed on the rows the model was fitted on, so it is an
# in-sample figure and not a generalisation estimate.
auc_by_set = {}
for set_name, feature_matrix_values in (
    ("Original Features", X_orig_imputed),
    ("Engineered Features", X_eng_imputed),
):
    try:
        fitted = model.fit(feature_matrix_values, y)
        set_auc = roc_auc_score(y, fitted.predict_proba(feature_matrix_values)[:, 1])
        set_status = "Success"
    except Exception as e:
        # Deliberate best-effort: a failure for one feature set is recorded in
        # the Status column so the other set is still reported.
        set_auc = None
        set_status = f"Error: {str(e)}"

    auc_by_set[set_name] = set_auc
    results["Feature Set"].append(set_name)
    results["ROC-AUC"].append(None if set_auc is None else round(set_auc, 4))
    results["Status"].append(set_status)

auc_orig = auc_by_set["Original Features"]
auc_eng = auc_by_set["Engineered Features"]

results_df = pd.DataFrame(results)
display(results_df)

# Compare against None explicitly: a legitimate AUC of 0.0 is falsy and would
# otherwise be treated as a failed evaluation. A zero baseline is skipped
# because the relative improvement is undefined.
if auc_orig is not None and auc_eng is not None and auc_orig != 0:
    improvement = ((auc_eng - auc_orig) / auc_orig) * 100
    display(Markdown(f"**ROC-AUC Improvement: {improvement:.2f}%**"))

# Plot the original and engineered training frames side by side, save the
# figure to disk, then embed the saved file as this cell's output.
feature_space_image = "../images/feature_engineering/feature_space_comparison.png"

fig = plot_pca_comparison(
    original_train_df,
    train_df,
    save_path=feature_space_image,
)

Image(filename=feature_space_image)

# Snapshot the domain-engineered frames before any further processing.
domain_engineered_train = train_df.copy()
domain_engineered_test = test_df.copy()

# Numeric predictors and targets for both splits.
X_train = train_df.drop("is_malicious", axis=1).select_dtypes(include=["number"])
y_train = train_df["is_malicious"]
X_test = test_df.drop("is_malicious", axis=1).select_dtypes(include=["number"])
y_test = test_df["is_malicious"]

# Keep the shared columns in training-frame order. Building the list from a
# set intersection made the order depend on string hashing, which changes
# between interpreter sessions, so the aligned matrices were not reproducible.
test_column_names = set(X_test.columns)
common_columns = [
    col for col in dict.fromkeys(X_train.columns) if col in test_column_names
]
print(f"Common columns between train and test: {len(common_columns)}")
print(f"Columns in train but not test: {set(X_train.columns) - set(X_test.columns)}")
print(f"Columns in test but not train: {set(X_test.columns) - set(X_train.columns)}")

X_train = X_train[common_columns]
X_test = X_test[common_columns]

# Positional index so both matrices line up row-by-row from zero.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

print(f"Aligned data - Training: {X_train.shape}, Test: {X_test.shape}")

Common columns between train and test: 170
Columns in train but not test: {'resource_types'}
Columns in test but not train: {'section_alignment', 'section_0_entropy', 'has_resources', 'sections_max_entropy', 'section_4_size'}
Aligned data - Training: (18952, 170), Test: (4716, 170)

# Base columns offered to the automated feature generator.
# NOTE(review): several of these names are not among the train/test common
# columns at this point; the helper presumably ignores the missing ones (the
# recorded run reports using 10 of them) - confirm.
important_features = [
    # entropy measurements
    "entropy",
    "sections_max_entropy",
    "section_0_entropy",
    "section_3_entropy",
    "section_4_entropy",
    # header values and sizes
    "timestamp",
    "size_of_init_data",
    "entry_point",
    "avg_string_len",
    "image_base",
    "size_of_code",
    "size",
    # counts
    "num_sections",
    "num_imports",
    # signature flags
    "has_signature",
    "is_signature_clean",
]

# Numeric predictors of the domain-engineered snapshots, target removed.
X_train_domain = domain_engineered_train.drop(
    columns=["is_malicious"]
).select_dtypes(include=["number"])
X_test_domain = domain_engineered_test.drop(
    columns=["is_malicious"]
).select_dtypes(include=["number"])

generated = generate_polynomial_features(
    X_train_domain,
    X_test_domain,
    important_features=important_features,
    use_featuretools=True,
    polynomial_degree=2,
)
feature_matrix, feature_matrix_test, selected_features, common_columns = generated

2025-05-18 17:23:51,468 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Found 170 common columns between training and test datasets
2025-05-18 17:23:51,468 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Using 10 important features from 170 total common features
Built 110 features
Elapsed: 00:00 | Progress: 100%|██████████
2025-05-18 17:23:51,823 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Generated 100 new features using Featuretools
2025-05-18 17:23:51,823 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Selected 271 features for evaluation

# Score the domain-engineered feature set.
domain_eval = evaluate_auto_engineered_features(
    X_train_domain, X_test_domain, y_train, y_test
)
domain_train_auc, domain_test_auc, domain_model, domain_cols = domain_eval

# Score the automatically generated feature set with the same evaluator.
auto_eval = evaluate_auto_engineered_features(
    feature_matrix, feature_matrix_test, y_train, y_test
)
auto_train_auc, auto_test_auc, auto_model, auto_cols = auto_eval

# Automated columns that the domain set does not already contain.
domain_col_lookup = set(domain_cols)
new_auto_features = [name for name in auto_cols if name not in domain_col_lookup]

# Combined set: domain columns followed by the new automated columns.
train_parts = [X_train_domain[domain_cols], feature_matrix[new_auto_features]]
test_parts = [X_test_domain[domain_cols], feature_matrix_test[new_auto_features]]
X_train_combined = pd.concat(train_parts, axis=1)
X_test_combined = pd.concat(test_parts, axis=1)

domain_feature_count = len(domain_cols)
auto_feature_count = len(new_auto_features)
logger.info(
    f"Combined feature set before evaluation: {domain_feature_count} domain features + {auto_feature_count} auto features = {domain_feature_count + auto_feature_count} total"
)

combined_eval = evaluate_combined_features(
    X_train_combined, X_test_combined, y_train, y_test
)
combined_train_auc, combined_test_auc, combined_model, combined_cols = combined_eval

# Tabulate train/test ROC-AUC and feature counts for the three feature sets.
results_df = pd.DataFrame(
    {
        "Feature Set": ["Domain-Engineered", "Automated", "Combined"],
        "Train ROC-AUC": [domain_train_auc, auto_train_auc, combined_train_auc],
        "Test ROC-AUC": [domain_test_auc, auto_test_auc, combined_test_auc],
        "Feature Count": [len(domain_cols), len(auto_cols), len(combined_cols)],
    }
)
display(Markdown("**Feature Set Performance Comparison**"))
display(results_df)

# The winner is the row with the highest test ROC-AUC.
test_scores = results_df["Test ROC-AUC"]
best_model_name = results_df.loc[test_scores.idxmax(), "Feature Set"]
best_auc = test_scores.max()

display(
    Markdown(
        f"**Best performing model: {best_model_name} with Test AUC = {best_auc:.4f}**"
    )
)

# Map the winning label to its model and column list; any label other than
# the first two falls back to the combined set.
named_candidates = {
    "Domain-Engineered": (domain_model, domain_cols),
    "Automated": (auto_model, auto_cols),
}
best_model, best_cols = named_candidates.get(
    best_model_name, (combined_model, combined_cols)
)

try:
    # Absolute value of the first row of `coef_` serves as the importance.
    importances = abs(best_model.coef_[0])
    feature_importance = pd.DataFrame(
        {"Feature": best_cols, "Importance": importances}
    ).sort_values("Importance", ascending=False)

    display(Markdown("**Top 10 Most Important Features:**"))
    display(feature_importance.head(10))
except (AttributeError, IndexError):
    display(Markdown("Could not extract feature importances from the model"))

2025-05-18 17:23:51,970 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Found 1 columns with NaN values
2025-05-18 17:23:51,971 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Dropping 1 columns with >10% NaN values
2025-05-18 17:23:51,981 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Found 1 columns with NaN values
2025-05-18 17:23:51,982 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Dropping 1 columns with >10% NaN values
2025-05-18 17:23:51,986 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Warning: Using only 169 common columns for evaluation
2025-05-18 17:24:02,992 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Used 169 features for evaluation
2025-05-18 17:24:03,152 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Found 8 columns with NaN values
2025-05-18 17:24:03,155 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Dropping 3 columns with >10% NaN values
2025-05-18 17:24:03,162 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Imputing 5 columns with <10% NaN values
2025-05-18 17:24:03,206 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Found 7 columns with NaN values
2025-05-18 17:24:03,206 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Dropping 3 columns with >10% NaN values
2025-05-18 17:24:03,209 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Imputing 4 columns with <10% NaN values
2025-05-18 17:24:03,217 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Warning: Using only 267 common columns for evaluation
2025-05-18 17:24:34,446 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Used 267 features for evaluation
2025-05-18 17:24:34,495 - __main__ - INFO - Combined feature set before evaluation: 169 domain features + 98 auto features = 267 total
2025-05-18 17:24:59,366 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Combined feature set after processing: 267 features
2025-05-18 17:24:59,374 - windows_malware_classifier.preprocessing.feature_engineering_tools - INFO - Domain features preserved: 0/0

def _frame_for_pca_plot(features, target):
    """Build a plotting frame: features plus target, with inf and NaN set to 0."""
    frame = pd.DataFrame(features)
    # Re-attach the target under a fresh positional index.
    frame["is_malicious"] = target.reset_index(drop=True)
    # Infinite values become NaN first so a single fill handles both.
    return frame.replace([np.inf, -np.inf], np.nan).fillna(0)


# Clean copies of the domain and automated feature matrices for visualization.
domain_engineered_train_clean = _frame_for_pca_plot(X_train_domain, y_train)
feature_matrix_clean = _frame_for_pca_plot(feature_matrix, y_train)

automated_space_image = (
    "../images/feature_engineering/automated_feature_space_comparison.png"
)
fig = plot_pca_comparison(
    domain_engineered_train_clean,
    feature_matrix_clean,
    save_path=automated_space_image,
)

Image(filename=automated_space_image)

automated_train_df = pd.DataFrame(feature_matrix)
automated_test_df = pd.DataFrame(feature_matrix_test)

# Restrict both frames to their shared columns, in training-frame order. A
# set intersection yields a hash-dependent order, which made the column
# layout of the saved datasets differ between runs.
test_column_names = set(automated_test_df.columns)
common_cols = [
    col
    for col in dict.fromkeys(automated_train_df.columns)
    if col in test_column_names
]
# Explicit copies so the target column below is added to independent frames
# rather than to slices of the feature matrices.
automated_train_df = automated_train_df[common_cols].copy()
automated_test_df = automated_test_df[common_cols].copy()

# Re-attach the targets under a fresh positional index.
automated_train_df["is_malicious"] = y_train.reset_index(drop=True)
automated_test_df["is_malicious"] = y_test.reset_index(drop=True)

# Infinite values are converted to NaN so the imputer handles them as well.
automated_train_df = automated_train_df.replace([np.inf, -np.inf], np.nan)
automated_test_df = automated_test_df.replace([np.inf, -np.inf], np.nan)

automated_train_df, automated_test_df = impute_numeric_neural_network(
    automated_train_df, automated_test_df
)

# Run the imputer once more if the first pass left any gaps.
if automated_train_df.isna().any().any() or automated_test_df.isna().any().any():
    logger.info(
        "Some NaN values remained after first imputation. Applying second pass."
    )
    automated_train_df, automated_test_df = impute_numeric_neural_network(
        automated_train_df, automated_test_df
    )

# "Before" counts NaNs in the raw feature matrices (infinite values are not
# included); "after" counts NaNs in the imputed frames.
missing_before = (
    feature_matrix.isna().sum().sum() + feature_matrix_test.isna().sum().sum()
)
missing_after = (
    automated_train_df.isna().sum().sum() + automated_test_df.isna().sum().sum()
)
logger.info(
    f"Imputation complete: {missing_before} missing values before, {missing_after} missing values after"
)

# Write the imputed train and test frames to parquet without the index, then
# log their final shapes.
for engineered_frame, parquet_path in (
    (automated_train_df, "../data/engineered/train_df_engineered.parquet"),
    (automated_test_df, "../data/engineered/test_df_engineered.parquet"),
):
    engineered_frame.to_parquet(parquet_path, index=False)

logger.info(
    f"Saved fully imputed datasets - Training: {automated_train_df.shape}, Testing: {automated_test_df.shape}"
)

2025-05-18 17:25:03,740 - __main__ - INFO - Imputation complete: 23668 missing values before, 0 missing values after
2025-05-18 17:25:04,061 - __main__ - INFO - Saved fully imputed datasets - Training: (18952, 486), Testing: (4716, 486)

	correlation
section_4_size_discrepancy	0.223216
section_0_1_size_ratio	0.169285
section_3_size_discrepancy	0.024457
section_1_2_size_ratio	0.023159
section_2_size_discrepancy	0.023016

	Metric	Value
0	original_features	195.0000
1	transformed_features	193.0000
2	original_mean_correlation	0.1609
3	transformed_mean_correlation	0.1647
4	dimensionality_reduction	0.0103

	Feature	Importance
258	image_base * num_imports	5.401744e-12
5	entry_point * size	1.188293e-12
89	size * size_of_init_data	1.086800e-12
48	entry_point * size_of_code	4.662242e-13
26	image_base * num_sections	3.426649e-13
200	size * size_of_code	3.183942e-13
161	entry_point * size_of_init_data	2.833249e-13
115	section_2_chars	1.856899e-13
8	section_0_chars	1.288977e-13
59	section_1_chars	1.168257e-13

Feature Set	Train ROC-AUC	Test ROC-AUC	Feature Count
Domain-Engineered	0.909639	0.919924	168
Automated	0.928899	0.938000	266
Combined	0.928899	0.938000	266

Feature	Importance
image_base * num_imports	5.492935e-12
entry_point * size	1.353751e-12
size * size_of_init_data	1.009394e-12
entry_point * size_of_code	5.050794e-13
image_base * num_sections	3.411408e-13
size * size_of_code	2.783395e-13
section_2_chars	2.781856e-13
section_0_chars	1.934363e-13
section_1_chars	1.754262e-13
section_3_chars	1.256364e-13

02. Feature Engineering and Selection

Introduction

Engineering Objectives

Engineering Pipeline

Feature Engineering Implementation

Categorical Feature Engineering

Entropy Feature Consolidation

Resource Feature Integration

Binary Content Optimization

Additional Feature Engineering

Validation of Feature Engineering

Automated Feature Engineering

Basis for Feature Selection

Automated Feature Engineering Results

Feature Engineering Performance Comparison

Feature Importance Analysis

Feature Engineering Summary

Key Achievements

Automated Feature Engineering

Statistical Improvements

Impact on Malware Detection

	correlation
string_density	0.411785
suspicious_net_strings	0.142134
network_registry_combo	0.025392
suspicious_strings	NaN

	correlation
suspicious_timestamp	0.487101
timestamp_year	0.185278
timestamp_round	0.149566
timestamp_hour	0.129070

	Feature Set	Shape before processing	NaN values
0	Original Features	(18952, 173)	0
1	Engineered Features	(18952, 171)	18952

	Feature Set	ROC-AUC	Status
0	Original Features	0.9087	Success
1	Engineered Features	0.9091	Success

	Feature Set	Train ROC-AUC	Test ROC-AUC	Feature Count
0	Domain-Engineered	0.906382	0.917426	169
1	Automated	0.928570	0.937950	267
2	Combined	0.928089	0.937919	267

	correlation
resource_complexity	0.356470
resource_risk	0.304085