Visualizing Data with Seaborn (Python Track)

2024-09-11

Introduction

  • What is Seaborn?
  • Why use Seaborn for data visualization?
  • Brief overview of the session

Source Code

GitHub 1

Colab

Setting Up the Environment

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from matplotlib import rcParams
# # Set global font properties to Arial
# rcParams.update(
#     {
#         "font.family": "sans-serif",
#         "font.sans-serif": "Arial",
#         "pdf.fonttype": 42,  # Embed fonts as Type 3 fonts for compatibility
#         "ps.fonttype": 42,
#         "text.usetex": False,
#         "svg.fonttype": "none",
#     }
# )


def stardize_columns(df):
    df.columns = [" ".join(col.strip().split()) for col in df.columns]
    # Basic data cleaning
    df["DATE OF OCCURRENCE"] = pd.to_datetime(df["DATE OF OCCURRENCE"])


# Load the data
df = pd.read_csv(
    "https://raw.githubusercontent.com/cauliyang/Visualizing-Data-with-Seaborn/main/data/Crimes_One_year_prior_to_present.csv",
    nrows=1000,
)
stardize_columns(df)

Understanding the Dataset

# Exercise: Explore the dataset
# 1. Use df.info() to display basic information about the dataset
# 2. Use df.head() to show the first few rows

# Your code here:


# What insights can you gather from this initial exploration?

data source: https://data.cityofchicago.org/Public-Safety/Crimes-One-year-prior-to-present/x2n5-8w5q/data

Understanding the Dataset

# Display basic information about the dataset
print(df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   CASE#                  1000 non-null   object        
 1   DATE OF OCCURRENCE     1000 non-null   datetime64[ns]
 2   BLOCK                  1000 non-null   object        
 3   IUCR                   1000 non-null   object        
 4   PRIMARY DESCRIPTION    1000 non-null   object        
 5   SECONDARY DESCRIPTION  1000 non-null   object        
 6   LOCATION DESCRIPTION   998 non-null    object        
 7   ARREST                 1000 non-null   object        
 8   DOMESTIC               1000 non-null   object        
 9   BEAT                   1000 non-null   int64         
 10  WARD                   1000 non-null   int64         
 11  FBI CD                 1000 non-null   object        
 12  X COORDINATE           999 non-null    float64       
 13  Y COORDINATE           999 non-null    float64       
 14  LATITUDE               999 non-null    float64       
 15  LONGITUDE              999 non-null    float64       
 16  LOCATION               999 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(10)
memory usage: 132.9+ KB
None

Introduction to Seaborn Plot Types

  • Overview of common Seaborn plot types
  • When to use each plot type
  • Basic syntax and structure
  • Complex plot type

Categorical Plots: Bar Plot

sns.countplot(
    data=df,
    y="PRIMARY DESCRIPTION",
    order=df["PRIMARY DESCRIPTION"].value_counts().index[:10],
)
plt.title("Top 10 Crime Types")
# sns.despine(offset=10, trim=True) Try that
plt.show()

Further Exploration

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine all secondary descriptions into a single string
text = " ".join(df["SECONDARY DESCRIPTION"].dropna())

# Create and generate a word cloud image
wordcloud = WordCloud(
    width=800, height=400, background_color="white", min_font_size=10
).generate(text)

# Display the generated image
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

Further Exploration

Distribution Plots: Histogram

df["HOUR"] = df["DATE OF OCCURRENCE"].dt.hour
plt.figure(figsize=(12, 5))
sns.histplot(data=df, x="HOUR", bins=24, kde=True)
plt.title("Distribution of Crimes by Hour of the Day")
plt.show()

Distribution Plots: KDE Plot

plt.figure(figsize=(12, 6))
sns.kdeplot(data=df, x="HOUR", hue="PRIMARY DESCRIPTION", common_norm=False)
plt.title("Distribution of Different Crime Types by Hour")
plt.show()

Relational Plots: Scatter Plot

plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x="LONGITUDE", y="LATITUDE", hue="PRIMARY DESCRIPTION")
plt.title("Geographical Distribution of Crimes")
plt.show()

Relational Plots: Scatter Plot

Relational Plots: Line Plot

crime_counts = df.groupby("DATE OF OCCURRENCE").size().reset_index(name="COUNT")
plt.figure(figsize=(10, 5))
sns.lineplot(data=crime_counts, x="DATE OF OCCURRENCE", y="COUNT")
plt.title("Crime Trends Over Time")
plt.xticks(rotation=45)
plt.show()

Advanced Customization

plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
sns.set_palette("deep")

g = sns.countplot(
    data=df,
    y="PRIMARY DESCRIPTION",
    order=df["PRIMARY DESCRIPTION"].value_counts().index[:10],
)

g.set_title("Top 10 Crime Types", fontsize=20)
g.set_xlabel("Count", fontsize=14)
g.set_ylabel("Crime Type", fontsize=14)

# Seaborn way: Add bar labels
for container in g.containers:
    g.bar_label(container)
# g.bar_label(g.containers[0])

# matplotlib way: Add bar labels
# for i, v in enumerate(df["PRIMARY DESCRIPTION"].value_counts()[:10]):
#     g.text(v + 3, i, str(v), color="black", va="center")

plt.tight_layout()
plt.show()

Advanced Customization

Categorical Plots: Box Plot

df["DAY_OF_WEEK"] = df["DATE OF OCCURRENCE"].dt.day_name()
plt.figure(figsize=(12, 5))
sns.boxplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE").set_ylabel("Date")
plt.title("Distribution of Crimes by Day of the Week")
plt.show()

Categorical Plots: Violin Plot

Now, it’s your turn!

# Exercise: Change the violinplot to a boxenplot
# Hint: Use sns.violinplot()

plt.figure(figsize=(12, 5))
# Your code here:

plt.title("Distribution of Crimes by Day of the Week")
plt.show()

Categorical Plots: Violin Plot

plt.figure(figsize=(12, 5))
sns.violinplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE").set_ylabel("Date")
plt.show()

Categorical Plots: enhanced box plot

Now, it’s your turn!

# Exercise: Change the violinplot to a boxenplot
# Hint: Use sns.boxenplot()

plt.figure(figsize=(12, 5))
# Your code here:

plt.title("Distribution of Crimes by Day of the Week")
plt.show()

Categorical Plots: enhanced box plot

plt.figure(figsize=(12, 5))
sns.boxenplot(data=df, x="DAY_OF_WEEK", y="DATE OF OCCURRENCE").set_ylabel("Date")
plt.show()

Summary of Categorical Plots

  • boxkenplot: Draw an enhanced box plot for larger datasets.
  • boxplot: Draw a box plot to show distributions with respect to categories.
  • violinplot: Draw a patch representing a KDE and add observations or box plot statistics.
  • stripplot: Draw a categorical scatterplot using jitter to reduce overplotting.
  • swarmplot: Draw a categorical scatterplot with points adjusted to be non-overlapping.

API Summary

Heatmap

  • Useful for visualizing correlation between variables
  • Can show patterns and relationships in complex datasets
# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df[numeric_cols].corr()

# Create heatmap
plt.figure(figsize=(6, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
plt.xticks(rotation=45, ha="right")
plt.show() 

Hint: Try sns.heatmap() with annot=False.

Heatmap

Customized Heatmap

Hint: Try np.tril instead of np.triu.

# Select numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df[numeric_cols].corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(8, 8))

# Create heatmap with only upper triangle
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    fmt=".2f",
)
plt.xticks(rotation=45, ha="right")
plt.show()

Customized Heatmap

Pair Plot

  • Useful for exploring relationships between multiple variables
  • Creates a grid of scatter plots for each pair of variables

Pair Plot

# Select relevant columns for the pair plot
# Hint: Choose a few relevant columns and use sns.pairplot()
cols_to_plot = ["X COORDINATE", "Y COORDINATE", "LATITUDE", "LONGITUDE"]

# Add hour of day
df["HOUR"] = pd.to_datetime(df["DATE OF OCCURRENCE"]).dt.hour

# Create the pair plot
plt.figure(figsize=(5, 5))
pairplot = sns.pairplot(
    df[cols_to_plot + ["HOUR", "PRIMARY DESCRIPTION"]],
    hue="PRIMARY DESCRIPTION",
    palette="viridis",
    plot_kws={"alpha": 0.6},
    diag_kind="kde",
)
plt.tight_layout()
plt.show()

Pair Plot

<Figure size 480x480 with 0 Axes>

Regression Plot

  • Visualizes the relationship between two variables
  • Includes a linear regression line and confidence interval
sns.lmplot(
    data=df,
    x="BEAT",
    y="WARD",
    col="ARREST",
    row="DOMESTIC",
    height=3,
    aspect=2,
    facet_kws=dict(sharex=False, sharey=False),
    scatter_kws={"alpha": 0.5},
)
plt.show()

Hint: try seaborn.regplot or seaborn.residplot

Regression Plot

Advanced Seaborn: FacetGrid

  • Demonstrates how to create multiple plots in a grid
  • Useful for comparing distributions across categories
# Create a FacetGrid
plt.figure(figsize=(4, 4))
g = sns.FacetGrid(df, col="PRIMARY DESCRIPTION", col_wrap=3, height=4, aspect=1.5)

# Map a histogram to each subplot
g.map(plt.hist, "HOUR", bins=24)

# Customize the plot
g.set_axis_labels("Hour of Day", "Count")
g.set_titles("{col_name}")
g.fig.suptitle("Distribution of Crimes by Hour for Different Crime Types", y=1.02)
g.tight_layout()
plt.show()

Advanced Seaborn: FacetGrid

<Figure size 384x384 with 0 Axes>

Seaborn figure styles

import numpy as np


def sinplot(n=10, flip=1):
    x = np.linspace(0, 14, 100)
    for i in range(1, n + 1):
        plt.plot(x, np.sin(x + i * 0.5) * (n + 2 - i) * flip)


f = plt.figure(figsize=(8, 8))
gs = f.add_gridspec(2, 2)

with sns.axes_style("darkgrid"):
    ax = f.add_subplot(gs[0, 0])
    sinplot(6)

with sns.axes_style("white"):
    ax = f.add_subplot(gs[0, 1])
    sinplot(6)

with sns.axes_style("ticks"):
    ax = f.add_subplot(gs[1, 0])
    sinplot(6)

with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[1, 1])
    sinplot(6)

f.tight_layout()

Seaborn figure styles

Best Practices and Tips

  • Choosing the right plot for your data
  • Pay attention to color choices and accessibility
  • Avoiding common pitfalls
  • Consider the story your visualization is telling

Q&A

  • Q&A session

Additional resources Resours

  • https://seaborn.pydata.org
  • https://www.data-to-viz.com/
  • https://data.cityofchicago.org/Public-Safety/Crimes-One-year-prior-to-present/x2n5-8w5q/data
  • https://quarto.org/docs/presentations/revealjs/
  • https://seaborn.pydata.org/examples
  • https://seaborn.pydata.org/tutorial/color_palettes