%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

import logging
import sys
import warnings

import numpy as np
from IPython.display import Image
from windows_malware_classifier.analysis.feature_analysis_tools import (
    calculate_shap_values,
    display_importance_rankings,
    display_shap_impacts,
    extract_high_correlations,
    run_statistical_tests,
)
from windows_malware_classifier.preprocessing.data_preparation_tools import (
    analyze_dataset_quality,
    detect_outliers_iqr,
    display_column_types,
    calculate_pe_statistics,
    load_malware_dataset,
    optimize_memory_usage,
    impute_numeric_neural_network,
)
from windows_malware_classifier.visualization.distributions_plots import (
    plot_category_distributions,
    plot_feature_histograms,
)

# Reproducibility and notebook-wide noise control.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seed NumPy's global RNG
warnings.filterwarnings("ignore")  # suppress every warning category for this session

# Send INFO-and-above records to stdout so they show up in the cell output.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[stdout_handler],
    level=logging.INFO,
)

# Module-level logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

# Load the dataset already split into train/test frames. The fixed random_state
# is presumably what keeps the split reproducible. Per the log output, the loader
# also filters to PE files and writes the two splits to CSV.
train_df, test_df = load_malware_dataset(split_data=True, random_state=RANDOM_SEED)

2025-05-18 17:19:27,333 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Attempting to load dataset from: /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset.csv
2025-05-18 17:19:27,333 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Attempting to load dataset from: /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset.csv
2025-05-18 17:19:27,654 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Filtered dataset to PE files only
2025-05-18 17:19:27,654 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Filtered dataset to PE files only
2025-05-18 17:19:27,655 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Original dataset shape: (25117, 98)
2025-05-18 17:19:27,655 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Original dataset shape: (25117, 98)
2025-05-18 17:19:27,658 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - PE files dataset shape: (23895, 98)
2025-05-18 17:19:27,658 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - PE files dataset shape: (23895, 98)
2025-05-18 17:19:28,372 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Saved train dataset to /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset_train.csv
2025-05-18 17:19:28,372 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Saved train dataset to /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset_train.csv
2025-05-18 17:19:28,373 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Saved test dataset to /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset_test.csv
2025-05-18 17:19:28,373 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Saved test dataset to /Users/vytautasbunevicius/windows-malware-classifier/data/raw/malware_dataset_test.csv
2025-05-18 17:19:28,373 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Train shape: (19116, 98), Test shape: (4779, 98)
2025-05-18 17:19:28,373 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Train shape: (19116, 98), Test shape: (4779, 98)

train_df.head()

# Summarize the test split and report it through the notebook's own `logger`
# (defined with the logging setup) rather than the root logger, so the record
# is attributed to this notebook. The dict is passed as a lazy %-argument.
test_insights = calculate_pe_statistics(test_df)
logger.info("Test Set Insights: %s", test_insights)

2025-05-18 17:19:28,583 - root - INFO - Test Set Insights:
2025-05-18 17:19:28,583 - root - INFO - {'total_samples': 4779, 'malicious_count': 2904, 'benign_count': 1875, 'malware_ratio': 60.76585059635907, 'feature_count': 98, 'numeric_features': 86, 'categorical_features': 12, 'section_features': 31, 'security_features': 4, 'missing_values': 42364, 'missing_value_columns': 15, 'memory_usage': '3.61 MB', 'has_timestamps': True, 'unique_machine_types': 5, 'avg_file_size': {'value': 506699.3008997698, 'metric': 'bytes'}, 'malware_to_benign_ratio': 1.5488}

# Summarize the training split and report it through the notebook's own `logger`
# (defined with the logging setup) rather than the root logger, so the record
# is attributed to this notebook. The dict is passed as a lazy %-argument.
train_insights = calculate_pe_statistics(train_df)
logger.info("Training Set Insights: %s", train_insights)

2025-05-18 17:19:28,666 - root - INFO - Training Set Insights:
2025-05-18 17:19:28,666 - root - INFO - {'total_samples': 19116, 'malicious_count': 11737, 'benign_count': 7379, 'malware_ratio': 61.39882820673781, 'feature_count': 98, 'numeric_features': 86, 'categorical_features': 12, 'section_features': 31, 'security_features': 4, 'missing_values': 169956, 'missing_value_columns': 15, 'memory_usage': '14.44 MB', 'has_timestamps': True, 'unique_machine_types': 6, 'avg_file_size': {'value': 501682.720862105, 'metric': 'bytes'}, 'malware_to_benign_ratio': 1.5905949315625423}

# Display column types
# The helper returns a pandas DataFrame (confirmed by the type print below);
# leaving `result` as the final expression renders it as the cell output.
result = display_column_types(train_df)
print(type(result))
result

<class 'pandas.core.frame.DataFrame'>

# Convert column dtypes in both splits to reduce memory (the log reports
# categorical, numeric and boolean conversions). `stats` is the helper's third
# return value — presumably the optimization summary; verify against the helper.
# NOTE(review): the exact meaning of categorical_threshold=0.5 is defined by the helper.
train_df, test_df, stats = optimize_memory_usage(
    train_df=train_df, test_df=test_df, categorical_threshold=0.5, verbose=True
)

2025-05-18 17:19:28,940 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Initial memory usage - Train: 28.03MB, Test: 7.01MB
2025-05-18 17:19:28,940 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Initial memory usage - Train: 28.03MB, Test: 7.01MB
2025-05-18 17:19:29,146 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Optimization complete - Train: 14.43MB reduced (51.5%), Test: 3.55MB reduced (50.7%) | Conversions - Categorical: 8, Numeric: 78, Boolean: 8
2025-05-18 17:19:29,146 - windows_malware_classifier.preprocessing.data_preparation_tools - INFO - Optimization complete - Train: 14.43MB reduced (51.5%), Test: 3.55MB reduced (50.7%) | Conversions - Categorical: 8, Numeric: 78, Boolean: 8

# First data-quality pass over both splits (before de-duplication and imputation),
# showing report parts 1, 2 and 3.
analysis_results = analyze_dataset_quality(
    train_df=train_df, test_df=test_df, verbose=True, parts_to_display=[1, 2, 3]
)

# Hashes that occur in both splits would leak test samples into training.
train_hashes = set(train_df["sha256"])
test_hashes = set(test_df["sha256"])
cross_set_duplicates = train_hashes & test_hashes

# Drop every row carrying a shared hash from BOTH splits.
train_leak_mask = train_df["sha256"].isin(cross_set_duplicates)
test_leak_mask = test_df["sha256"].isin(cross_set_duplicates)
train_df = train_df[~train_leak_mask]
test_df = test_df[~test_leak_mask]

# Impute on explicit copies of the filtered frames; the helper returns the
# imputed train/test pair, which replaces the originals.
train_df, test_df = impute_numeric_neural_network(train_df.copy(), test_df.copy())

# Second data-quality pass, run after de-duplication and imputation, limited to
# report parts 1 and 2. Stored under a separate name (trailing underscore) so the
# first report in `analysis_results` is not overwritten.
analysis_results_ = analyze_dataset_quality(
    train_df=train_df, test_df=test_df, verbose=True, parts_to_display=[1, 2]
)

# Compute SHAP-based feature importance on the training split against the
# `is_malicious` target. The returned object exposes `importance_scores` and
# `shap_values`, which the display helpers consume afterwards.
results = calculate_shap_values(
    df=train_df,
    target="is_malicious",
    n_estimators=100,
    binary_threshold=0.05,
    max_samples=10000,  # explicit row cap instead of None; reportedly the helper's default — TODO confirm
    background_samples=50,
    batch_size=1500,
    random_state=RANDOM_SEED,
)

================================================================================
                            Feature Analysis Summary                            
================================================================================

Total samples: 18,952

- Numerical    :  29 features
- Categorical  :   6 features
- Binary       : 156 features
--------------------------------------------------------------------------------

 97%|=================== | 2914/3000 [00:15<00:00]

2025-05-18 17:21:19,447 - root - INFO - ✓ Successfully analyzed numerical features
2025-05-18 17:21:19,454 - root - INFO - ✓ Successfully analyzed categorical features
2025-05-18 17:21:27,007 - root - INFO - ✓ Successfully analyzed binary features

# Render the tables from the SHAP run: importance rankings first, then the
# feature impact tables.
display_importance_rankings(results.importance_scores)
display_shap_impacts(results.shap_values)

====================================================================================================
                                  Top Feature Importance Analysis                                   
====================================================================================================

====================================================================================================
                                      Feature Impact Analysis                                       
====================================================================================================

# Feature groups passed to the distribution plots that follow.
# Numerical features are plotted as per-class histograms.
entropy_numerical = ["sections_max_entropy", "section_4_entropy", "section_0_entropy"]

size_numerical = ["entry_point", "size_of_code", "size_of_stack_reserve"]

behavior_numerical = ["timestamp", "avg_string_len", "machine_type", "subsystem"]

# Categorical features are plotted as per-class category distributions.
section_categorical = ["file_type", "section_3_name", "characteristics_flags"]

# Convenience unions of the groups above.
all_numerical = entropy_numerical + size_numerical + behavior_numerical
all_categorical = section_categorical

# Per-class histograms (40 bins) of the entropy features. The helper is given a
# save_path — presumably it writes the PNG there — and the same file is then
# loaded with IPython's Image so a static picture is shown in the output.
fig = plot_feature_histograms(
    df=train_df,
    features=entropy_numerical,
    target="is_malicious",
    nbins=40,
    custom_layout={"title_text": "Distribution of Numerical Entropy Features by Class"},
    save_path="../images/eda/numerical_entropy_distribution.png",
)

Image(filename="../images/eda/numerical_entropy_distribution.png")

# Per-class histograms (40 bins) of the size-related features. The helper is
# given a save_path — presumably it writes the PNG there — and the same file is
# then loaded with IPython's Image so a static picture is shown in the output.
fig = plot_feature_histograms(
    df=train_df,
    features=size_numerical,
    target="is_malicious",
    nbins=40,
    custom_layout={"title_text": "Distribution of Size-Related Features by Class"},
    save_path="../images/eda/numerical_size_distribution.png",
)

Image(filename="../images/eda/numerical_size_distribution.png")

# Per-class histograms (40 bins) of the behavioral feature group. The helper is
# given a save_path — presumably it writes the PNG there — and the same file is
# then loaded with IPython's Image so a static picture is shown in the output.
fig = plot_feature_histograms(
    df=train_df,
    features=behavior_numerical,
    target="is_malicious",
    nbins=40,
    custom_layout={"title_text": "Distribution of Behavioral Features by Class"},
    save_path="../images/eda/numerical_behavioral_features.png",
)

Image(filename="../images/eda/numerical_behavioral_features.png")

# Per-class distributions of the categorical group, limited via top_n=10, with an
# explicit 1600x800 layout. The helper is given a save_path — presumably it
# writes the PNG there — and the same file is then loaded with IPython's Image.
fig = plot_category_distributions(
    df=train_df,
    features=section_categorical,
    target="is_malicious",
    top_n=10,
    custom_layout={
        "title_text": "Distribution of Section Characteristics by Class",
        "height": 800,
        "width": 1600,
    },
    save_path="../images/eda/categorical_section_characteristics.png",
)

Image(filename="../images/eda/categorical_section_characteristics.png")

# Outlier scan over the training split; the helper takes the whole frame and
# selects the columns it needs itself.
anomalies = detect_outliers_iqr(train_df)

# Collect the numeric columns once, right before they are used. The earlier
# identical computation was dropped because its result was never read before
# being overwritten here.
numerical_features = train_df.select_dtypes(include=[np.number]).columns.tolist()

# Pairwise correlation (pandas default method) across all numeric columns.
corr_matrix = train_df[numerical_features].corr()

train_df.shape

(18952, 196)

# Pull out feature pairs whose correlation crosses the 0.95 threshold. Per the
# rendered output, the two returned tables separate pairs among regular features
# from pairs involving `*_missing` indicator columns.
regular_df, missing_df = extract_high_correlations(corr_matrix, threshold=0.95)

# Single source of truth for the significance level, so the value passed to the
# tests and the value printed in the summary cannot drift apart.
ALPHA = 0.01

# NOTE: this rebinds `results`, which previously held the SHAP analysis output.
results = run_statistical_tests(train_df, alpha=ALPHA)
results_df = results.data

subtitle = f"Analysis of {len(train_df)} samples (α = {ALPHA})"
significant_mask = results_df["Significant"] == "Yes"
# Summing a boolean mask counts the True entries directly.
significant_tests = significant_mask.sum()
# NOTE(review): rows for tests that errored out appear to stay in the table and
# are therefore included in this total — confirm that is the intended denominator.
total_tests = len(results_df)

print(
    f"{subtitle}\n"
    f"{significant_tests}/{total_tests} tests significant at α = {ALPHA} after Bonferroni correction"
)

display(results)

2025-05-18 17:21:30,525 - root - INFO - Raw p-value for H1: Maximum Section Entropy: 0.0
2025-05-18 17:21:30,529 - root - INFO - Raw p-value for H2: Third Section Entropy: 1.0
2025-05-18 17:21:30,532 - root - INFO - Raw p-value for H3: First Section Entropy: 0.0
2025-05-18 17:21:30,533 - root - INFO - Raw p-value for H4: Fourth Section Entropy: 1.0
2025-05-18 17:21:30,596 - root - INFO - Contingency Table for file_type:
is_malicious  0.00   1.00
file_type                
exe           1911  10901
dll           5305    835
2025-05-18 17:21:30,597 - root - INFO - Expected Frequencies for file_type:
[[4878.18657661 7933.81342339]
 [2337.81342339 3802.18657661]]
2025-05-18 17:21:30,654 - root - INFO - Contingency Table for characteristics:
is_malicious     0.00  1.00
characteristics            
11298.0            58     0
258.0             579  3682
259.0              53  1343
263.0              17     9
270.0              24   882
271.0              49  3010
290.0              39    80
291.0               3    22
302.0               0    10
303.0               2    15
33166.0             8   783
33167.0            16   282
33198.0             0    16
3330.0              6     1
33679.0             0    12
34.0              745   220
35.0               14    58
38.0               16     1
39.0               46     6
41358.0            15    38
47.0               77     7
547.0               0     6
551.0             106     1
558.0               7     0
559.0              48    14
771.0               0    35
775.0               8     2
782.0               2     8
783.0              21   361
815.0               4     4
8226.0           3478    68
8230.0            205     0
8238.0             64     0
8450.0            974   536
8454.0             22     1
8462.0            166   161
8482.0             82     9
8742.0            141     0
8750.0             47     4
8966.0             11     0
8974.0             36    11
Other              27    38
2025-05-18 17:21:30,655 - root - INFO - Expected Frequencies for characteristics:
[[2.20835796e+01 3.59164204e+01]
 [1.62238160e+03 2.63861840e+03]
 [5.31528915e+02 8.64471085e+02]
 [9.89953567e+00 1.61004643e+01]
 [3.44960743e+02 5.61039257e+02]
 [1.16471845e+03 1.89428155e+03]
 [4.53094133e+01 7.36905867e+01]
 [9.51878430e+00 1.54812157e+01]
 [3.80751372e+00 6.19248628e+00]
 [6.47277332e+00 1.05272267e+01]
 [3.01174335e+02 4.89825665e+02]
 [1.13463909e+02 1.84536091e+02]
 [6.09202195e+00 9.90797805e+00]
 [2.66525960e+00 4.33474040e+00]
 [4.56901646e+00 7.43098354e+00]
 [3.67425074e+02 5.97574926e+02]
 [2.74140988e+01 4.45859012e+01]
 [6.47277332e+00 1.05272267e+01]
 [1.97990713e+01 3.22009287e+01]
 [2.01798227e+01 3.28201773e+01]
 [3.19831152e+01 5.20168848e+01]
 [2.28450823e+00 3.71549177e+00]
 [4.07403968e+01 6.62596032e+01]
 [2.66525960e+00 4.33474040e+00]
 [2.36065851e+01 3.83934149e+01]
 [1.33262980e+01 2.16737020e+01]
 [3.80751372e+00 6.19248628e+00]
 [3.80751372e+00 6.19248628e+00]
 [1.45447024e+02 2.36552976e+02]
 [3.04601098e+00 4.95398902e+00]
 [1.35014436e+03 2.19585564e+03]
 [7.80540312e+01 1.26945969e+02]
 [2.43680878e+01 3.96319122e+01]
 [5.74934572e+02 9.35065428e+02]
 [8.75728155e+00 1.42427184e+01]
 [1.24505699e+02 2.02494301e+02]
 [3.46483748e+01 5.63516252e+01]
 [5.36859434e+01 8.73140566e+01]
 [1.94183200e+01 3.15816800e+01]
 [4.18826509e+00 6.81173491e+00]
 [1.78953145e+01 2.91046855e+01]
 [2.47488392e+01 4.02511608e+01]]
2025-05-18 17:21:30,700 - root - ERROR - Error in Chi-squared for section_3_name: Cannot setitem on a Categorical with a new category (Other), set the categories first
2025-05-18 17:21:30,707 - root - INFO - Raw p-value for H8: Average String Length: 0.0
2025-05-18 17:21:30,713 - root - INFO - P-values before correction: [0.0, 1.0, 0.0, 1.0, 0.0, 0.0]
2025-05-18 17:21:30,714 - root - INFO - Corrected p-values: [0.0, 1.0, 0.0, 1.0, 0.0, 0.0]
Analysis of 18952 samples (α = 0.01)
4/8 tests significant at α = 0.01 after Bonferroni correction

from pathlib import Path

# Persist the cleaned splits for the next notebook. Create the output directory
# first so a fresh checkout without data/processed does not fail on write.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

train_df.to_parquet(processed_dir / "train_df.parquet", index=False)
test_df.to_parquet(processed_dir / "test_df.parquet", index=False)

	filename	size	md5	sha256	entropy	is_malicious	is_pe	file_type	is_exe	is_dll	object_key	machine_type	timestamp	num_sections	characteristics	characteristics_flags	size_of_code	size_of_init_data	entry_point	base_of_code	image_base	section_alignment	file_alignment	major_os_version	subsystem	dll_characteristics	size_of_stack_reserve	size_of_heap_reserve	section_0_name	section_0_entropy	section_0_virt_size	section_0_size	section_0_chars	section_0_ptr_raw_data	section_1_name	section_1_entropy	section_1_virt_size	section_1_size	section_1_chars	section_1_ptr_raw_data	section_2_name	section_2_entropy	section_2_virt_size	section_2_size	section_2_chars	section_2_ptr_raw_data	section_3_name	section_3_entropy	section_3_virt_size	section_3_size	section_3_chars	section_3_ptr_raw_data	section_4_name	section_4_entropy	section_4_virt_size	section_4_size	section_4_chars	section_4_ptr_raw_data	sections_avg_entropy	sections_min_entropy	sections_max_entropy	num_imports	num_imported_dlls	suspicious_imports	has_resources	num_resources	resource_types	resource_entropy	has_debug	has_tls	num_strings	avg_string_len	num_ips	num_file_paths	contains_unicode	contains_nullbytes	suspicious_pattern_count	detected_patterns	is_text_file	line_count	avg_line_length	contains_base64	contains_hex_strings	byte_distribution
20968	1/nb0KdOOmE7UtMiwvFNmRwTqvfXKVMVGd.exe	571392	ed125c3cecce28197ac78d02b2b726dc	068f8f5419192944a9428ea625fe56e1e8ad5cc3554798...	7.73	1	1	exe	1	0	1/nb0KdOOmE7UtMiwvFNmRwTqvfXKVMVGd.exe	332.00	1595948062.00	3.00	270.00	IMAGE_FILE_EXECUTABLE_IMAGE\|IMAGE_FILE_LINE_NU...	568832.00	2048.00	576714.00	8192.00	4194304.00	8192.00	512.00	4.00	2.00	34112.00	1048576.00	1048576.00	.text	7.74	568528.00	568832.00	1610612768.00	512.00	.reloc	0.10	12.00	512.00	1107296320.00	569344.00	.rsrc	4.38	1464.00	1536.00	1073741888.00	569856.00	NaN	0.00	0.00	0.00	0.00	0.00	NaN	0.00	0.00	0.00	0.00	0.00	4.07	0.10	7.74	1.00	1.00	0.00	1	2.00	1.00	4.17	0	0	5520.00	15.15	11.00	0.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10432	1/5541yjpjyOvUqiXjg5mtsk1IJMilPaUD.exe	724480	89d1c5b0a8b0b1f9c16580c8c2715a86	b6072e84d6cfb921a3fb0a38bc13e148a308b7b4158cd9...	7.15	1	1	exe	1	0	1/5541yjpjyOvUqiXjg5mtsk1IJMilPaUD.exe	332.00	708992537.00	8.00	33166.00	IMAGE_FILE_EXECUTABLE_IMAGE\|IMAGE_FILE_LINE_NU...	417792.00	305664.00	421408.00	4096.00	4194304.00	4096.00	512.00	4.00	2.00	0.00	1048576.00	1048576.00	CODE	6.61	417384.00	417792.00	1610612768.00	1024.00	DATA	3.91	4724.00	5120.00	3221225536.00	418816.00	BSS	0.00	3317.00	0.00	3221225472.00	423936.00	.idata	5.04	8688.00	8704.00	3221225536.00	423936.00	.tls	0.00	16.00	0.00	3221225472.00	432640.00	3.73	0.00	7.44	384.00	8.00	5.00	1	404.00	1.00	6.75	0	1	8830.00	8.06	0.00	0.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
20465	1/leGvSQva4e3YV52SQpSfaXLPdNdZDF7T.exe	178890	81765089205fd56e1fb8551217c7aae4	ed656132c965b692b3b0906e8ffad4f9d431a33f22653d...	5.49	1	1	exe	1	0	1/leGvSQva4e3YV52SQpSfaXLPdNdZDF7T.exe	332.00	1597988712.00	6.00	270.00	IMAGE_FILE_EXECUTABLE_IMAGE\|IMAGE_FILE_LINE_NU...	61440.00	147456.00	28576.00	4096.00	4194304.00	4096.00	4096.00	4.00	2.00	0.00	1048576.00	1048576.00	.text	6.52	59438.00	61440.00	1610612768.00	4096.00	.rdata	3.08	5840.00	8192.00	1073741888.00	65536.00	.data	3.09	12296.00	8192.00	3221225536.00	73728.00	.idata	3.53	2334.00	4096.00	3221225536.00	81920.00	.rsrc	4.86	106720.00	110592.00	1073741888.00	86016.00	3.51	0.00	6.52	83.00	5.00	2.00	1	5.00	1.00	2.60	1	0	992.00	7.46	0.00	0.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
16759	1/Vw2RIkAUVD38nXC7xmVYEDMB5NnvN45h.exe	976384	aaf02255794de006522a31b1e4a84d23	77cd50a78f234331630b2a437f8b01a7cbeee5d74b0ac4...	6.97	1	1	exe	1	0	1/Vw2RIkAUVD38nXC7xmVYEDMB5NnvN45h.exe	332.00	708992537.00	8.00	33166.00	IMAGE_FILE_EXECUTABLE_IMAGE\|IMAGE_FILE_LINE_NU...	637440.00	337920.00	641228.00	4096.00	4194304.00	4096.00	512.00	4.00	2.00	0.00	1048576.00	1048576.00	CODE	6.62	637204.00	637440.00	1610612768.00	1024.00	DATA	4.20	7560.00	7680.00	3221225536.00	638464.00	BSS	0.00	3413.00	0.00	3221225472.00	646144.00	.idata	4.87	9218.00	9728.00	3221225536.00	646144.00	.tls	0.00	16.00	0.00	3221225472.00	655872.00	3.72	0.00	7.22	402.00	10.00	5.00	1	317.00	1.00	6.26	0	1	12456.00	8.74	0.00	0.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2718	0/IMrn5gOSgoXbM4hipCIis0DBrRG9onG5.dll	61440	cf6fe5f60bdb122a741dc7f045247ea3	11147ba1376bf82a0547e7583dd16cfc2fb2d60a1138f6...	4.75	0	1	dll	0	1	0/IMrn5gOSgoXbM4hipCIis0DBrRG9onG5.dll	332.00	1377160472.00	3.00	8482.00	IMAGE_FILE_EXECUTABLE_IMAGE\|IMAGE_FILE_LARGE_A...	49152.00	8192.00	54126.00	8192.00	268435456.00	8192.00	4096.00	4.00	3.00	34144.00	1048576.00	1048576.00	.text	5.51	45940.00	49152.00	1610612768.00	4096.00	.rsrc	1.22	1152.00	4096.00	1073741888.00	53248.00	.reloc	0.01	12.00	4096.00	1107296320.00	57344.00	NaN	0.00	0.00	0.00	0.00	0.00	NaN	0.00	0.00	0.00	0.00	0.00	2.25	0.01	5.51	1.00	1.00	0.00	1	1.00	1.00	3.54	1	0	876.00	20.32	3.00	1.00	0.00	0.00	NaN	NaN	0.00	NaN	NaN	0.00	0.00	NaN

	feature	importance
25	sections_max_entropy	0.0614
20	section_4_entropy	0.0604
17	section_3_entropy	0.0414
28	avg_string_len	0.0413
7	section_0_entropy	0.0388
21	section_4_virt_size	0.0344
1	entropy	0.0333
2	timestamp	0.0329
22	section_4_ptr_raw_data	0.0189
18	section_3_virt_size	0.0145
0	size	0.0138
15	section_2_virt_size	0.0134
3	size_of_code	0.0113
8	section_0_virt_size	0.0110
19	section_3_ptr_raw_data	0.0105

	feature	importance
5	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0791
90	section_3_name_.pdata	0.0756
15	characteristics_flags_IMAGE_FILE_RELOCS_STRIPP...	0.0517
0	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0385
118	section_4_name_unknown	0.0353
8	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0206
11	characteristics_flags_IMAGE_FILE_RELOCS_STRIPP...	0.0191
1	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0175
43	section_1_name_.data	0.0146
93	section_3_name_.rsrc	0.0124
104	section_4_name_.didat	0.0115
109	section_4_name_.pdata	0.0107
98	section_3_name_unknown	0.0099
72	section_2_name_.rsrc	0.0093
69	section_2_name_.rdata	0.0080

	feature	importance
4681	is_text_file_missing_1	0.0440
4684	contains_base64_missing_1	0.0387
4678	contains_nullbytes_missing_1	0.0327
4677	contains_unicode_missing_1	0.0312
8	machine_type_34404.0	0.0234
1	file_type_exe	0.0231
4685	contains_hex_strings_missing_1	0.0210
3	is_dll_1.0	0.0207
682	subsystem_2.0	0.0164
683	subsystem_3.0	0.0134
79	characteristics_8226.0	0.0128
3291	has_exports_1.0	0.0126
758	size_of_stack_reserve_1048576.0	0.0116
4019	has_debug_1.0	0.0116
618	major_os_version_4.0	0.0089

	feature	mean_impact	feature_type
5	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0149	categorical
90	section_3_name_.pdata	0.0222	categorical
25	sections_max_entropy	0.0034	numerical
20	section_4_entropy	0.0029	numerical
15	characteristics_flags_IMAGE_FILE_RELOCS_STRIPP...	0.0018	categorical
4681	is_text_file_missing_1	0.0109	binary
28	avg_string_len	0.0017	numerical
7	section_0_entropy	0.0029	numerical
4684	contains_base64_missing_1	0.0075	binary
0	characteristics_flags_IMAGE_FILE_EXECUTABLE_IM...	0.0032	categorical
118	section_4_name_unknown	0.0012	categorical
21	section_4_virt_size	0.0057	numerical
1	entropy	0.0039	numerical
2	timestamp	0.0050	numerical
4678	contains_nullbytes_missing_1	0.0064	binary

01. Data Loading and Exploratory Data Analysis (EDA)¶

Introduction¶

Analysis Objectives¶

Analysis Pipeline¶

Success Metrics¶

Data Loading and Initial Inspection¶

Detailed Dataset Insights¶

Data Type Analysis¶

Interpretation of SHAP Analysis¶

Feature Distribution Analysis¶

Numerical Entropy Features¶

Numerical Behavioral Features¶

Categorical Section Characteristics¶

Outlier Detection (IQR)¶

Analysis of Outliers¶

Correlation Analysis¶

Interpretation of Correlation Analysis¶

Regular Feature Correlations¶

Missing Indicator Correlations¶

Feature Selection Decisions¶

Statistical Significance Testing¶

Interpretation of Statistical Test Results¶

Next Steps: Feature Engineering and Selection¶

Saving Processed Data¶

	feature	mean_impact	feature_type
17	section_3_entropy	-0.0010	numerical
11	characteristics_flags_IMAGE_FILE_RELOCS_STRIPP...	-0.0003	categorical
93	section_3_name_.rsrc	-0.0004	categorical
3	size_of_code	-0.0030	numerical
8	section_0_virt_size	-0.0013	numerical
19	section_3_ptr_raw_data	-0.0018	numerical
5	entry_point	-0.0032	numerical
98	section_3_name_unknown	-0.0002	categorical
72	section_2_name_.rsrc	-0.0008	categorical
618	major_os_version_4.0	-0.0001	binary
10	section_1_entropy	-0.0007	numerical
24	sections_min_entropy	-0.0005	numerical
14	section_2_entropy	-0.0012	numerical
46	characteristics_271.0	-0.0001	binary
16	section_2_ptr_raw_data	-0.0015	numerical

	Feature Name	Anomalies	Percentage	IQR Bounds	Flagged Values
0	size	1144	6.04%	[-775246.00, 1535842.00]	[1536000.00, 4239088.00]
1	entropy	569	3.00%	[3.48, 9.33]	[0.13, 3.48]
7	timestamp	5169	27.27%	[1083695160.50, 1908202284.50]	[0.00, 4294967295.00]
8	num_sections	504	2.66%	[-1.50, 10.50]	[11.00, 25.00]
9	characteristics	1175	6.20%	[-11694.00, 20178.00]	[33166.00, 49582.00]
10	size_of_code	862	4.55%	[-538880.00, 978688.00]	[978944.00, 125768626.00]
11	size_of_init_data	1422	7.50%	[-274432.00, 479232.00]	[479744.00, 3734747915.00]
12	size_of_uninit_data	1964	10.36%	[0.00, 0.00]	[1.00, 137035776.00]
13	entry_point	1015	5.36%	[-502775.25, 872766.75]	[873230.00, 1948744943.00]
14	base_of_code	559	2.95%	[-2048.00, 14336.00]	[20480.00, 137039872.00]
15	image_base	4447	23.46%	[-2566848512.00, 4289265664.00]	[4294967296.00, 18446735277616529408.00]
16	section_alignment	2	0.01%	[-2048.00, 14336.00]	[2097152.00, 2097152.00]
17	file_alignment	3410	17.99%	[512.00, 512.00]	[16.00, 4096.00]
18	major_os_version	3504	18.49%	[1.00, 9.00]	[0.00, 10.00]
19	minor_os_version	1321	6.97%	[0.00, 0.00]	[1.00, 51.00]
20	major_image_version	45	0.24%	[-7.50, 12.50]	[13.00, 21315.00]
21	minor_image_version	871	4.60%	[0.00, 0.00]	[1.00, 26001.00]
22	subsystem	22	0.12%	[0.50, 4.50]	[0.00, 16.00]
24	size_of_stack_reserve	5278	27.85%	[1048576.00, 1048576.00]	[0.00, 33554432.00]
25	size_of_heap_reserve	471	2.49%	[1048576.00, 1048576.00]	[0.00, 16777216.00]
27	section_0_entropy	2161	11.40%	[4.42, 7.94]	[0.00, 8.00]
28	section_0_virt_size	916	4.83%	[-543715.00, 995861.00]	[996536.00, 137035776.00]
29	section_0_size	821	4.33%	[-533504.00, 965632.00]	[967168.00, 4116480.00]
30	section_0_chars	2562	13.52%	[1610612768.00, 1610612768.00]	[1073741888.00, 4026531904.00]
31	section_0_ptr_raw_data	3534	18.65%	[-256.00, 1792.00]	[2048.00, 115712.00]
33	section_1_virt_size	2067	10.91%	[-74034.50, 127529.50]	[127596.00, 49270652.00]
34	section_1_size	2007	10.59%	[-72192.00, 124416.00]	[124928.00, 4079616.00]
35	section_1_chars	4537	23.94%	[268435568.00, 2415919088.00]	[0.00, 3763339296.00]
36	section_1_ptr_raw_data	817	4.31%	[-531904.00, 961600.00]	[962048.00, 4116992.00]
38	section_2_virt_size	2419	12.76%	[-33802.12, 56818.88]	[56864.00, 93568008.00]
39	section_2_size	2388	12.60%	[-13312.00, 23552.00]	[24064.00, 11509760.00]
41	section_2_ptr_raw_data	902	4.76%	[-605440.00, 1108736.00]	[1108992.00, 4122112.00]
43	section_3_virt_size	3268	17.24%	[-20458.50, 34097.50]	[34152.00, 40349696.00]
44	section_3_size	3222	17.00%	[-13824.00, 23040.00]	[23552.00, 6451200.00]
45	section_3_chars	2733	14.42%	[-1610612832.00, 2684354720.00]	[3221225472.00, 4026531904.00]
46	section_3_ptr_raw_data	1657	8.74%	[-367104.00, 611840.00]	[612864.00, 11669504.00]
48	section_4_virt_size	3593	18.96%	[-1932.00, 3220.00]	[3224.00, 30527488.00]
49	section_4_size	3428	18.09%	[-2304.00, 3840.00]	[4096.00, 30527488.00]
50	section_4_chars	3098	16.35%	[-1660944480.00, 2768240800.00]	[3221225472.00, 4026531904.00]
51	section_4_ptr_raw_data	3012	15.89%	[-205056.00, 341760.00]	[342016.00, 4120576.00]
52	sections_avg_entropy	211	1.11%	[1.39, 6.64]	[0.00, 7.99]
53	sections_min_entropy	33	0.17%	[-3.55, 5.97]	[6.05, 7.99]
54	sections_max_entropy	550	2.90%	[3.90, 9.44]	[0.00, 3.90]
55	num_imports	938	4.95%	[-246.50, 413.50]	[414.00, 3314.00]
56	num_imported_dlls	1388	7.32%	[-12.50, 23.50]	[24.00, 92.00]
57	suspicious_imports	344	1.82%	[-4.50, 7.50]	[8.00, 15.00]
59	num_exports	3507	18.50%	[-1.50, 2.50]	[3.00, 11116.00]
60	has_resources	1545	8.15%	[1.00, 1.00]	[0.00, 0.00]
61	num_resources	3006	15.86%	[-17.00, 31.00]	[32.00, 820.00]
63	resource_types	1547	8.16%	[1.00, 1.00]	[0.00, 2.00]
64	resource_entropy	2215	11.69%	[1.37, 5.85]	[0.00, 8.00]
67	has_tls	2938	15.50%	[0.00, 0.00]	[1.00, 1.00]
70	num_strings	1282	6.76%	[-7502.88, 14886.12]	[14887.00, 128647.00]
71	avg_string_len	1312	6.92%	[-3.46, 25.97]	[25.98, 5406.87]
72	num_urls	4255	22.45%	[-1.50, 2.50]	[3.00, 1001.00]
73	num_ips	3255	17.17%	[-3.00, 5.00]	[6.00, 5635.00]
74	num_emails	2480	13.09%	[0.00, 0.00]	[1.00, 350.00]
75	num_registry	33	0.17%	[0.00, 0.00]	[1.00, 50.00]
76	num_file_paths	3745	19.76%	[0.00, 0.00]	[1.00, 6774.00]
119	section_0_name_missing	23	0.12%	[0.00, 0.00]	[1.00, 1.00]
125	section_1_name_missing	96	0.51%	[0.00, 0.00]	[1.00, 1.00]
131	section_2_name_missing	526	2.78%	[0.00, 0.00]	[1.00, 1.00]

	Category	Count
0	High Anomaly Features (>10%)	29
1	Moderate Anomaly Features (5-10%)	11
2	Low Anomaly Features (<5%)	144

	Description	Value
0	Total Rows with Anomalies	18390
1	Percentage of Rows with Anomalies	97.03%
2	Features with Anomalies	62
3	Total Numerical Features	184

Metric	Count
Total correlated pairs (threshold > 0.95)	22
Regular feature pairs	7
Missing indicator pairs	15

	Feature 1	Feature 2	Correlation
0	is_exe	is_dll	-1.000
1	has_resources	resource_types	0.999
2	image_base	section_alignment	0.997
3	section_0_size	section_1_ptr_raw_data	0.994
4	section_4_virt_size	section_4_size	0.980
5	major_image_version	minor_image_version	0.976
6	size_of_uninit_data	section_0_virt_size	0.961

	Feature 1	Feature 2	Correlation
0	contains_unicode_missing	contains_nullbytes_missing	1.000
1	contains_unicode_missing	is_text_file_missing	1.000
2	contains_unicode_missing	contains_base64_missing	1.000
3	contains_unicode_missing	contains_hex_strings_missing	1.000
4	contains_nullbytes_missing	is_text_file_missing	1.000
5	contains_nullbytes_missing	contains_base64_missing	1.000
6	contains_nullbytes_missing	contains_hex_strings_missing	1.000
7	is_text_file_missing	contains_base64_missing	1.000
8	is_text_file_missing	contains_hex_strings_missing	1.000
9	contains_base64_missing	contains_hex_strings_missing	1.000
10	is_malicious	contains_unicode_missing	0.950
11	is_malicious	contains_nullbytes_missing	0.950
12	is_malicious	is_text_file_missing	0.950
13	is_malicious	contains_base64_missing	0.950
14	is_malicious	contains_hex_strings_missing	0.950

Hypothesis	Test	Feature	Statistic	P-value	P-value (corrected)	Effect Size	Direction	Significant
H1: Maximum Section Entropy	Mann-Whitney U	sections_max_entropy	67717109.0000	0	0	1.0000	Greater	Yes
H2: Third Section Entropy	Mann-Whitney U	section_3_entropy	33718258.5000	1	1	0.0000	Greater	No
H3: First Section Entropy	Mann-Whitney U	section_0_entropy	60879658.5000	0	0	0.0000	Greater	Yes
H4: Fourth Section Entropy	Mann-Whitney U	section_4_entropy	24744577.5000	1	1	0.0000	Greater	No
H5: File Type Distribution	Chi-squared	file_type	8993.0400	0	0	0.6889	N/A	Yes
H6: PE Characteristics	Chi-squared	characteristics	---	---	---	---	N/A	No
H7: Third Section Name	Chi-squared	section_3_name	---	---	---	---	N/A	No
H8: Average String Length	Mann-Whitney U	avg_string_len	22931036.5000	0	0	0.0000	Two-sided	Yes

	Dataset	MD5 Duplicates	SHA256 Duplicates
0	Train	38	38
1	Test	1	1
2	Cross_set	39	39

01. Data Loading and Exploratory Data Analysis (EDA)¶

Introduction¶

Analysis Objectives¶

Analysis Pipeline¶

Success Metrics¶

Data Loading and Initial Inspection¶

Detailed Dataset Insights¶

Data Type Analysis¶

Interpretation of SHAP Analysis¶

Feature Distribution Analysis¶

Numerical Entropy Features¶

Numerical Size-Related Features¶

Numerical Behavioral Features¶

Categorical Section Characteristics¶

Outlier Detection (IQR)¶

Analysis of Outliers¶

Correlation Analysis¶

Interpretation of Correlation Analysis¶

Regular Feature Correlations¶

Missing Indicator Correlations¶

Feature Selection Decisions¶

Statistical Significance Testing¶

Interpretation of Statistical Test Results¶

Next Steps: Feature Engineering and Selection¶

Saving Processed Data¶