Home / Blog / Data Science / Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA)

  • June 16, 2023
  • 17355
  • 86
Author Images

Meet the Author : Mr. Bharani Kumar

Bharani Kumar Depuru is a well known IT personality from Hyderabad. He is the Founder and Director of Innodatatics Pvt Ltd and 360DigiTMG. Bharani Kumar is an IIT and ISB alumni with more than 18+ years of experience, he held prominent positions in the IT elites like HSBC, ITC Infotech, Infosys, and Deloitte. He is a prevalent IT consultant specializing in Industrial Revolution 4.0 implementation, Data Analytics practice setup, Artificial Intelligence, Big Data Analytics, Industrial IoT, Business Intelligence and Business Management. Bharani Kumar is also the chief trainer at 360DigiTMG with more than Ten years of experience and has been making the IT transition journey easy for his students. 360DigiTMG is at the forefront of delivering quality education, thereby bridging the gap between academia and industry.

Read More >
First Moment Business Decision / Measures of Central Tendency

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')

# mean
mean = data['Value'].mean()
print(mean)

# median
mean = data['Value'].median()
print(median)

# mode
mean = data['Value'].mode()
print(mode)
 
# mean
SELECT AVG(column) AS mean_column
FROM TABLE_Name;

# median
SELECT column AS median_Column5
FROM (
SELECT column, ROW_NUMBER() OVER (ORDER BY column) AS row_num,
COUNT(*) OVER () AS total_count
FROM TABLE_Name
) AS subquery
WHERE row_num = (total_count + 1) / 2 OR row_num = (total_count + 2) / 2;

# mode
SELECT column AS mode_Column3
FROM (
SELECT column, COUNT(*) AS frequency
FROM TABLE_Name
GROUP BY column
ORDER BY frequency DESC
LIMIT 1
) AS subquery;
Second Moment Business Decision / Measures of Dispersion

Python code

SQL code

import pandas as pd

# Read the data from Excel
data = pd.read_excel('data.xlsx')

# Standard Deviation of Salary
salary_stddev = data['Salary'].std()
print(""Standard Deviation of Salary:"", salary_stddev)

# Range of Years of Experience
experience_range = data['Years of Experience'].max() - data['Years of Experience'].min()
print(""Range of Years of Experience:"", experience_range)

# Variance of Performance Rating
performance_variance = data['Performance Rating'].var()
print(""Variance of Performance Rating:"", performance_variance)
# Standard Deviation of Column4
SELECT STDDEV(Column4) AS Column4_stddev
FROM TABLE_Name;

# Range of Years of Column5:
SELECT MAX(Column5) - MIN(Column5) AS Column5_range
FROM TABLE_Name;

# Variance of Performance Column6
SELECT VARIANCE(Column6) AS performance_variance
FROM TABLE_Name;
Third Moment Business Decision / Skewness

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
skewness = data['Value'].skew()
print(skewness)
SELECT
(
SUM(POWER(column- (SELECT AVG(column) FROM TABLE_NAME), 3)) /
(COUNT(*) * POWER((SELECT STDDEV(column) FROM TABLE_NAME), 3))
) AS skewness

FROM TABLE_NAME;
Fourth Moment Business Decision / Kurtosis

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
kurtosis = data['Value'].kurtosis()
print(kurtosis)
SELECT
(
(SUM(POWER(column- (SELECT AVG(column) FROM TABLE_NAME), 4)) /
(COUNT(*) * POWER((SELECT STDDEV(column) FROM TABLE_NAME), 4))) - 3
) AS kurtosis
FROM TABLE_NAME;
Graphical Representation
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_excel('data.xlsx')
plt.figure(figsize=(10, 6))

# Plotting the original salary column
plt.hist(df['Column'], bins=20)
plt.title('Any Name')
plt.tight_layout()
plt.show()

# There are many functions for Graphical representations in Python for Univariate, Bivariate, and Multivariate data
SQL is focused on querying and manipulating data at the database level rather than generating visual representations of the data. SQL is used to extract the necessary data from the database, and then the retrieved data is typically processed and visualized using other programming languages or tools.
Typecasting

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
data['Value'] = data['Value'].astype(int)
print(data['Value'].dtypes)
SELECT CAST(column AS CHAR(55)) AS column _str
FROM TABLE_NAME;
Handling Duplicates

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
data.drop_duplicates(inplace = True)
print(data)
# Count duplicates
SELECT column, COUNT(*) as duplicate_count
FROM TABLE_NAME
GROUP BY column
HAVING COUNT(*) > 1;
/*This query will return the ""Column1"" columns and their respective count of duplicates in the ""TABLE_Name"" table.*/

# Drop duplicates
CREATE TABLE temp_TABLE_NAME AS
SELECT DISTINCT *
FROM TABLE_NAME;

TRUNCATE TABLE TABLE_NAME;

INSERT INTO TABLE_NAME
SELECT * FROM temp_TABLE_NAME;

DROP TABLE temp_TABLE_Name;
/*In this Query, a temporary table named ""temp_TABLE_NAME"" is created to hold the unique records using the DISTINCT keyword. Then, the original ""TABLE_NAME"" table is truncated (emptied), and the unique records are reinserted into it. Finally, the temporary table is dropped.*/
Outlier Treatment

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
Q1 = data['Value'].quantile(0.25)
Q3 = data['Value'].quantile(0.75)
IQR = Q3 - Q1
data = data[(data['Value'] >= Q1 - 1.5IQR) & (data['Value'] <= Q3 + 1.5IQR)]
print(data)
UPDATE TABLE_Name AS e
JOIN (
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
NTILE(4) OVER (ORDER BY Column5) AS Column5_quartile
FROM TABLE_Name
) AS subquery ON e.Column1 = subquery.Column1
SET e.Column5 = (
SELECT AVG(Column5)
FROM (
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
NTILE(4) OVER (ORDER BY Column5) AS Column5_quartile
FROM TABLE_Name
) AS temp
WHERE Column5_quartile = subquery.Column5_quartile
)
WHERE subquery.Column5_quartile IN (1, 4);
Zero & near Zero Variance features

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
variance = data.var()
near_zero_var_features = variance[variance < 0.01]
print(near_zero_var_features)
SELECT
VARIANCE(column1) AS Column4_variance,
VARIANCE(column2) AS Column5_variance,
VARIANCE(column3) AS Column6_variance
FROM TABLE_Name;
Missing Values

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
data.dropna(inplace = True)
print(data)
SELECT
COUNT(*) AS total_rows,
SUM(CASE WHEN column1IS NULL THEN 1 ELSE 0 END) AS column1_missing,
SUM(CASE WHEN column2 IS NULL THEN 1 ELSE 0 END) AS column2 _missing,
SUM(CASE WHEN column3 IS NULL THEN 1 ELSE 0 END) AS column3 _missing
FROM TABLE_Name;
/*This query provides the count of total rows and the number of missing columns for each column */

# delete null columns
DELETE FROM TABLE_Name
WHERE column IS NULL;
Normalization

Python code

SQL code

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
data = pd.read_excel('data.xlsx')
scaler = MinMaxScaler()
data['Value'] = scaler.fit_transform(data[['Value']])
print(data)
select * from TABLE_Name;
CREATE TABLE TABLE_Name_scaled AS
SELECT
Column1,
Column2,
Column3,
Column4,
(Column4 - min_Column4) / (max_Column4 - min_Column4) AS scaled_Column4,
(Column5 - min_Column5) / (max_Column5 - min_Column5) AS scaled_Column5,
(Column6 - min_Column6) / (max_Column6 - min_Column6) AS scaled_Column6
FROM (
SELECT
Column1,
Column2,
Column3,
Column4,
Column4,
Column5,
Column6,
(SELECT MIN(Column4) FROM TABLE_Name) AS min_Column4,
(SELECT MAX(Column4) FROM TABLE_Name) AS max_Column4,
(SELECT MIN(Column5) FROM TABLE_Name) AS min_Column5,
(SELECT MAX(Column5) FROM TABLE_Name) AS max_Column5,
(SELECT MIN(Column6) FROM TABLE_Name) AS min_Column6,
(SELECT MAX(Column6) FROM TABLE_Name) AS max_Column6
FROM TABLE_Name
) AS scaled_data;
Discretization/Binning/Grouping

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
data['Value_bins'] = pd.cut(data['Value'], bins=3, labels=['Low', 'Medium', 'High'])
print(data)
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
CASE
WHEN Column5 < 50000 THEN 'Low'
WHEN Column5 >= 50000 AND Column5 < 100000 THEN 'Medium'
WHEN Column5 >= 100000 THEN 'High'
ELSE 'Unknown'
END AS Column5_group
FROM TABLE_Name;
Dummy Variable Creation

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
dummy_vars = pd.get_dummies(data['Category'])
data = pd.concat([data, dummy_vars], axis=1)
print(data)
select * from TABLE_Name;
SELECT
Column1,
Column2,
Column4,
Column5,
CASE WHEN Column3 = 'HR' THEN 1 ELSE 0 END AS is_hr,
CASE WHEN Column3 = 'Finance' THEN 1 ELSE 0 END AS is_finance,
CASE WHEN Column3 = 'IT' THEN 1 ELSE 0 END AS is_it,
CASE WHEN Column3 = 'Sales' THEN 1 ELSE 0 END AS is_sales,
CASE WHEN Column3 = 'Marketing' THEN 1 ELSE 0 END AS is_marketing
FROM TABLE_Name;
Transformations

Python code

SQL code

import pandas as pd
data = pd.read_excel('data.xlsx')
data['Value_log'] = np.log(data['Value'])
print(data)
# Create the new table
CREATE TABLE TABLE_Name_transformed AS
SELECT
Column1,
Column2,
Column3,
Column4,
Column5,
Column6,
Column7,
LOG(Column5) AS Column5_log,
SQRT(Column5) AS Column5_sqrt
FROM TABLE_Name;

Data Science Training Institutes in Other Locations

 
 
Read
Success Stories
Make an Enquiry