Home / Blog / Data Science / Exploratory Data Analysis (EDA)

# Exploratory Data Analysis (EDA)

• June 16, 2023
• 12640
• 86

### Meet the Author : Mr. Bharani Kumar

Bharani Kumar Depuru is a well known IT personality from Hyderabad. He is the Founder and Director of Innodatatics Pvt Ltd and 360DigiTMG. Bharani Kumar is an IIT and ISB alumni with more than 18+ years of experience, he held prominent positions in the IT elites like HSBC, ITC Infotech, Infosys, and Deloitte. He is a prevalent IT consultant specializing in Industrial Revolution 4.0 implementation, Data Analytics practice setup, Artificial Intelligence, Big Data Analytics, Industrial IoT, Business Intelligence and Business Management. Bharani Kumar is also the chief trainer at 360DigiTMG with more than Ten years of experience and has been making the IT transition journey easy for his students. 360DigiTMG is at the forefront of delivering quality education, thereby bridging the gap between academia and industry.

First Moment Business Decision / Measures of Central Tendency

## SQL code

import pandas as pd

# mean
mean = data['Value'].mean()
print(mean)

# median
mean = data['Value'].median()
print(median)

# mode
mean = data['Value'].mode()
print(mode)

# mean
SELECT AVG(column) AS mean_column
FROM TABLE_Name;

# median
SELECT column AS median_Column5
FROM (
SELECT column, ROW_NUMBER() OVER (ORDER BY column) AS row_num,
COUNT(*) OVER () AS total_count
FROM TABLE_Name
) AS subquery
WHERE row_num = (total_count + 1) / 2 OR row_num = (total_count + 2) / 2;

# mode
SELECT column AS mode_Column3
FROM (
SELECT column, COUNT(*) AS frequency
FROM TABLE_Name
GROUP BY column
ORDER BY frequency DESC
LIMIT 1
) AS subquery;
Second Moment Business Decision / Measures of Dispersion

## SQL code

import pandas as pd

# Read the data from Excel

# Standard Deviation of Salary
salary_stddev = data['Salary'].std()
print(""Standard Deviation of Salary:"", salary_stddev)

# Range of Years of Experience
experience_range = data['Years of Experience'].max() - data['Years of Experience'].min()
print(""Range of Years of Experience:"", experience_range)

# Variance of Performance Rating
performance_variance = data['Performance Rating'].var()
print(""Variance of Performance Rating:"", performance_variance)
# Standard Deviation of Column4
SELECT STDDEV(Column4) AS Column4_stddev
FROM TABLE_Name;

# Range of Years of Column5:
SELECT MAX(Column5) - MIN(Column5) AS Column5_range
FROM TABLE_Name;

# Variance of Performance Column6
SELECT VARIANCE(Column6) AS performance_variance
FROM TABLE_Name;
Third Moment Business Decision / Skewness

## SQL code

import pandas as pd
skewness = data['Value'].skew()
print(skewness)
SELECT
(
SUM(POWER(column- (SELECT AVG(column) FROM TABLE_NAME), 3)) /
(COUNT(*) * POWER((SELECT STDDEV(column) FROM TABLE_NAME), 3))
) AS skewness

FROM TABLE_NAME;
Fourth Moment Business Decision / Kurtosis

## SQL code

import pandas as pd
kurtosis = data['Value'].kurtosis()
print(kurtosis)
SELECT
(
(SUM(POWER(column- (SELECT AVG(column) FROM TABLE_NAME), 4)) /
(COUNT(*) * POWER((SELECT STDDEV(column) FROM TABLE_NAME), 4))) - 3
) AS kurtosis
FROM TABLE_NAME;
Graphical Representation
import pandas as pd
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))

# Plotting the original salary column
plt.hist(df['Column'], bins=20)
plt.title('Any Name')
plt.tight_layout()
plt.show()

# There are many functions for Graphical representations in Python for Univariate, Bivariate, and Multivariate data
SQL is focused on querying and manipulating data at the database level rather than generating visual representations of the data. SQL is used to extract the necessary data from the database, and then the retrieved data is typically processed and visualized using other programming languages or tools.
Typecasting

## SQL code

import pandas as pd
data['Value'] = data['Value'].astype(int)
print(data['Value'].dtypes)
SELECT CAST(column AS CHAR(55)) AS column _str
FROM TABLE_NAME;
Handling Duplicates

## SQL code

import pandas as pd
data.drop_duplicates(inplace = True)
print(data)
# Count duplicates
SELECT column, COUNT(*) as duplicate_count
FROM TABLE_NAME
GROUP BY column
HAVING COUNT(*) > 1;
/*This query will return the ""Column1"" columns and their respective count of duplicates in the ""TABLE_Name"" table.*/

# Drop duplicates
CREATE TABLE temp_TABLE_NAME AS
SELECT DISTINCT *
FROM TABLE_NAME;

TRUNCATE TABLE TABLE_NAME;

INSERT INTO TABLE_NAME
SELECT * FROM temp_TABLE_NAME;

DROP TABLE temp_TABLE_Name;
/*In this Query, a temporary table named ""temp_TABLE_NAME"" is created to hold the unique records using the DISTINCT keyword. Then, the original ""TABLE_NAME"" table is truncated (emptied), and the unique records are reinserted into it. Finally, the temporary table is dropped.*/
Outlier Treatment

## SQL code

import pandas as pd
Q1 = data['Value'].quantile(0.25)
Q3 = data['Value'].quantile(0.75)
IQR = Q3 - Q1
data = data[(data['Value'] >= Q1 - 1.5IQR) & (data['Value'] <= Q3 + 1.5IQR)]
print(data)
UPDATE TABLE_Name AS e
JOIN (
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
NTILE(4) OVER (ORDER BY Column5) AS Column5_quartile
FROM TABLE_Name
) AS subquery ON e.Column1 = subquery.Column1
SET e.Column5 = (
SELECT AVG(Column5)
FROM (
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
NTILE(4) OVER (ORDER BY Column5) AS Column5_quartile
FROM TABLE_Name
) AS temp
WHERE Column5_quartile = subquery.Column5_quartile
)
WHERE subquery.Column5_quartile IN (1, 4);
Zero & near Zero Variance features

## SQL code

import pandas as pd
variance = data.var()
near_zero_var_features = variance[variance < 0.01]
print(near_zero_var_features)
SELECT
VARIANCE(column1) AS Column4_variance,
VARIANCE(column2) AS Column5_variance,
VARIANCE(column3) AS Column6_variance
FROM TABLE_Name;
Missing Values

## SQL code

import pandas as pd
data.dropna(inplace = True)
print(data)
SELECT
COUNT(*) AS total_rows,
SUM(CASE WHEN column1IS NULL THEN 1 ELSE 0 END) AS column1_missing,
SUM(CASE WHEN column2 IS NULL THEN 1 ELSE 0 END) AS column2 _missing,
SUM(CASE WHEN column3 IS NULL THEN 1 ELSE 0 END) AS column3 _missing
FROM TABLE_Name;
/*This query provides the count of total rows and the number of missing columns for each column */

# delete null columns
DELETE FROM TABLE_Name
WHERE column IS NULL;
Normalization

## SQL code

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data['Value'] = scaler.fit_transform(data[['Value']])
print(data)
select * from TABLE_Name;
CREATE TABLE TABLE_Name_scaled AS
SELECT
Column1,
Column2,
Column3,
Column4,
(Column4 - min_Column4) / (max_Column4 - min_Column4) AS scaled_Column4,
(Column5 - min_Column5) / (max_Column5 - min_Column5) AS scaled_Column5,
(Column6 - min_Column6) / (max_Column6 - min_Column6) AS scaled_Column6
FROM (
SELECT
Column1,
Column2,
Column3,
Column4,
Column4,
Column5,
Column6,
(SELECT MIN(Column4) FROM TABLE_Name) AS min_Column4,
(SELECT MAX(Column4) FROM TABLE_Name) AS max_Column4,
(SELECT MIN(Column5) FROM TABLE_Name) AS min_Column5,
(SELECT MAX(Column5) FROM TABLE_Name) AS max_Column5,
(SELECT MIN(Column6) FROM TABLE_Name) AS min_Column6,
(SELECT MAX(Column6) FROM TABLE_Name) AS max_Column6
FROM TABLE_Name
) AS scaled_data;
Discretization/Binning/Grouping

## SQL code

import pandas as pd
data['Value_bins'] = pd.cut(data['Value'], bins=3, labels=['Low', 'Medium', 'High'])
print(data)
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
CASE
WHEN Column5 < 50000 THEN 'Low'
WHEN Column5 >= 50000 AND Column5 < 100000 THEN 'Medium'
WHEN Column5 >= 100000 THEN 'High'
ELSE 'Unknown'
END AS Column5_group
FROM TABLE_Name;
Dummy Variable Creation

## SQL code

import pandas as pd
dummy_vars = pd.get_dummies(data['Category'])
data = pd.concat([data, dummy_vars], axis=1)
print(data)
select * from TABLE_Name;
SELECT
Column1,
Column2,
Column4,
Column5,
CASE WHEN Column3 = 'HR' THEN 1 ELSE 0 END AS is_hr,
CASE WHEN Column3 = 'Finance' THEN 1 ELSE 0 END AS is_finance,
CASE WHEN Column3 = 'IT' THEN 1 ELSE 0 END AS is_it,
CASE WHEN Column3 = 'Sales' THEN 1 ELSE 0 END AS is_sales,
CASE WHEN Column3 = 'Marketing' THEN 1 ELSE 0 END AS is_marketing
FROM TABLE_Name;
Transformations

## SQL code

import pandas as pd
data['Value_log'] = np.log(data['Value'])
print(data)
# Create the new table
CREATE TABLE TABLE_Name_transformed AS
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
LOG(Column5) AS Column5_log,
SQRT(Column5) AS Column5_sqrt
FROM TABLE_Name;