- Today, we’ll dive deeper into Exploratory Data Analysis (EDA).
- We’ll focus on:
- Identifying patterns, relationships, and anomalies.
- Building more complex visualizations using `ggplot2`.
2025-08-22
ggplot2
.
# Baseline view: horsepower against fuel economy for every car in mtcars
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  labs(
    title = "HP vs MPG (Overall)",
    x = "Horsepower",
    y = "Miles Per Gallon"
  )
`facet_wrap()` or `facet_grid()`.
`facet_wrap()` is ideal for a single categorical variable.
`facet_grid()`
is used for two categorical variables.
# Same scatter plot, split into one panel per value of cyl_factor
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_wrap(~ cyl_factor) +
  labs(
    title = "HP vs MPG by Cylinder Count",
    x = "Horsepower",
    y = "Miles Per Gallon"
  )
geom_smooth()
geom_smooth()
adds a smoothed conditional mean to a plot.
# Scatter plot overlaid with a fitted trend line
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  # method = "lm" fits a straight line; se = FALSE hides the error band
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "HP vs MPG with Linear Trend",
    x = "Horsepower",
    y = "Miles Per Gallon"
  )
geom_smooth()
with faceting shows how trends vary across different groups.
# One panel per value of cyl_factor, each with its own linear fit
ggplot(mtcars, aes(x = hp, y = mpg)) +
  geom_point() +
  facet_wrap(~ cyl_factor) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "HP vs MPG by Cylinder Count with Linear Trends",
    x = "Horsepower",
    y = "Miles Per Gallon"
  )
# Pairwise correlations between the numeric mtcars columns.
# The columns are listed explicitly so only these eleven enter cor().
numeric_vars <- c(
  "mpg", "cyl", "disp", "hp", "drat", "wt",
  "qsec", "vs", "am", "gear", "carb"
)
cor_matrix <- cor(mtcars[, numeric_vars])
print(round(cor_matrix, 2))
##        mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
## mpg   1.00 -0.85 -0.85 -0.78  0.68 -0.87  0.42  0.66  0.60  0.48 -0.55
## cyl  -0.85  1.00  0.90  0.83 -0.70  0.78 -0.59 -0.81 -0.52 -0.49  0.53
## disp -0.85  0.90  1.00  0.79 -0.71  0.89 -0.43 -0.71 -0.59 -0.56  0.39
## hp   -0.78  0.83  0.79  1.00 -0.45  0.66 -0.71 -0.72 -0.24 -0.13  0.75
## drat  0.68 -0.70 -0.71 -0.45  1.00 -0.71  0.09  0.44  0.71  0.70 -0.09
## wt   -0.87  0.78  0.89  0.66 -0.71  1.00 -0.17 -0.55 -0.69 -0.58  0.43
## qsec  0.42 -0.59 -0.43 -0.71  0.09 -0.17  1.00  0.74 -0.23 -0.21 -0.66
## vs    0.66 -0.81 -0.71 -0.72  0.44 -0.55  0.74  1.00  0.17  0.21 -0.57
## am    0.60 -0.52 -0.59 -0.24  0.71 -0.69 -0.23  0.17  1.00  0.79  0.06
## gear  0.48 -0.49 -0.56 -0.13  0.70 -0.58 -0.21  0.21  0.79  1.00  0.27
## carb -0.55  0.53  0.39  0.75 -0.09  0.43 -0.66 -0.57  0.06  0.27  1.00
# Convert correlation matrix to long format for ggplot2.
# The plot below relies on melt() producing the columns Var1, Var2 and value
# (presumably reshape2::melt -- confirm against the file's library() calls).
melted_cor <- melt(cor_matrix)

# Heatmap of the correlations on a diverging scale centred on zero
ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    # Spelled out in full: `limit` only worked via partial argument matching
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels so the variable names do not overlap
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Correlation Matrix of mtcars variables")
# Density of horsepower; adjust = 0.3 narrows the default bandwidth
ggplot(mtcars, aes(x = hp)) +
  geom_density(fill = "lightblue", alpha = 0.7, adjust = 0.3) +
  labs(
    title = "Density Plot of Horsepower (mtcars)",
    x = "Horsepower",
    y = "Density"
  )
# Quartiles and interquartile range of 'hp'
hp_quartiles <- quantile(mtcars$hp, probs = c(0.25, 0.75))
Q1 <- hp_quartiles[1]
Q3 <- hp_quartiles[2]
IQR_val <- Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val

# Keep only the cars whose horsepower falls outside the fences
mtcars %>%
  filter(hp < lower_bound | hp > upper_bound) %>%
  select(model, hp)
##           model  hp
## 1 Maserati Bora 335
# Box plot of 'hp' from mtcars, used here for anomaly detection
ggplot(mtcars, aes(y = hp)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Horsepower (mtcars)",
    y = "Horsepower"
  )
# Calculate Z-scores for 'hp'.
# scale() returns a one-column matrix; as.numeric() flattens it to a plain
# vector so the filter() condition below is an ordinary logical vector
# (one-column matrices in filter() are deprecated as of dplyr 1.1.0).
mtcars$hp_zscore <- as.numeric(scale(mtcars$hp))

# Identify outliers: absolute Z-score greater than 2
mtcars %>%
  filter(abs(hp_zscore) > 2) %>%
  select(model, hp, hp_zscore)
##           model  hp hp_zscore
## 1 Maserati Bora 335  2.746567
# Example: share of each transmission type within every cylinder group
ggplot(mtcars, aes(x = cyl_factor, fill = as.factor(am))) +
  # position = "fill" stacks the bars and rescales them to proportions
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Transmission Types by Cylinder",
    x = "Cylinders",
    y = "Proportion",
    fill = "Transmission (0=Auto, 1=Manual)"
  ) +
  theme_minimal()
# Example: number of cars per cylinder group, split by transmission type
ggplot(mtcars, aes(x = cyl_factor, fill = as.factor(am))) +
  # position = "dodge" draws the transmission bars next to each other
  geom_bar(position = "dodge") +
  labs(
    title = "Car Count by Cylinder and Transmission (Grouped)",
    x = "Cylinders",
    y = "Count",
    fill = "Transmission (0=Auto, 1=Manual)"
  ) +
  theme_minimal()
# Compare the raw and log-transformed income distributions side by side.
# par() returns the previous settings; they are restored at the end so the
# 1 x 2 layout does not leak into plots drawn later.
old_par <- par(mfrow = c(1, 2)) # Arrange plots side by side
hist(
  income_data$income,
  main = "Original Income Distribution",
  xlab = "Income"
)
hist(
  log(income_data$income),
  main = "Log-transformed Income Distribution",
  xlab = "Log(Income)"
)
par(old_par)
# Average MPG for every cylinder/gear combination (dplyr).
# .groups = "drop" returns an ungrouped result and avoids the regrouping
# message summarise() prints after a multi-variable group_by().
mtcars_summary <- mtcars %>%
  group_by(cyl_factor, gear) %>%
  summarise(avg_mpg = mean(mpg), .groups = "drop")

# Heatmap: one tile per combination, coloured by its average MPG
ggplot(
  mtcars_summary,
  aes(x = cyl_factor, y = as.factor(gear), fill = avg_mpg)
) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "yellow", high = "red") +
  labs(
    title = "Average MPG by Cylinder and Gear",
    x = "Cylinders",
    y = "Gears",
    fill = "Average MPG"
  ) +
  theme_minimal()
Today, we explored advanced EDA techniques:
- `geom_smooth()`: Visualizing trends.