import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# from matplotlib import rcParams

# # Set global font properties to Arial
# rcParams.update(
#     {
#         "font.family": "sans-serif",
#         "font.sans-serif": "Arial",
#         "pdf.fonttype": 42,  # Embed fonts as TrueType (Type 42) instead of Type 3
#         "ps.fonttype": 42,
#         "text.usetex": False,
#         "svg.fonttype": "none",
#     }
# )


def stardize_columns(df):
    """Clean the column labels and parse the occurrence timestamp, in place.

    Each column label is converted to ``str``, stripped of leading and
    trailing whitespace, and has internal runs of whitespace collapsed to a
    single space. The ``"DATE OF OCCURRENCE"`` column is then converted with
    ``pd.to_datetime``.

    Args:
        df: DataFrame to modify. It is mutated; nothing is returned.

    Raises:
        KeyError: if no ``"DATE OF OCCURRENCE"`` column exists after the
            labels have been cleaned.
    """
    # str.split() with no separator already discards leading/trailing
    # whitespace, so a separate strip() is unnecessary. str() keeps this from
    # failing on non-string labels.
    df.columns = [" ".join(str(col).split()) for col in df.columns]

    # Basic data cleaning
    df["DATE OF OCCURRENCE"] = pd.to_datetime(df["DATE OF OCCURRENCE"])


# Load the data
# NOTE(review): the path argument is an empty string in this copy of the file,
# which pd.read_csv cannot open -- supply the dataset location before running.
df = pd.read_csv("", nrows=1000)
stardize_columns(df)
Understanding the Dataset
# Exercise: Explore the dataset
# 1. Use df.info() to display basic information about the dataset
# 2. Use df.head() to show the first few rows
# Your code here:

# What insights can you gather from this initial exploration?
data source:
# Display basic information about the dataset.
# NOTE(review): this statement was cut off after `print(` in the source, so the
# argument is unknown. df.info() is assumed from the comment above -- confirm
# against the original notebook. info() writes its report to stdout itself, so
# no print() wrapper is needed.
df.info()
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Merge every non-missing secondary description into one space-separated string.
secondary_descriptions = df["SECONDARY DESCRIPTION"].dropna()
text = " ".join(secondary_descriptions)

# Configure the cloud first, then build it from the combined text.
cloud_options = {
    "width": 800,
    "height": 400,
    "background_color": "white",
    "min_font_size": 10,
}
wordcloud = WordCloud(**cloud_options).generate(text)

# Render the cloud as an image and hide the axes.
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
Further Exploration
Distribution Plots: Histogram
# Hour of day taken from each occurrence timestamp.
df["HOUR"] = df["DATE OF OCCURRENCE"].dt.hour

plt.figure(figsize=(12, 5))
# HOUR is integer-valued, so use discrete bins: one unit-width bar centred on
# each hour. bins=24 would instead cut the *observed* range into 24 equal
# slices, whose edges do not line up with whole hours.
sns.histplot(data=df, x="HOUR", discrete=True, kde=True)
plt.title("Distribution of Crimes by Hour of the Day")
Distribution Plots: KDE Plot
# One density curve of HOUR per crime type. common_norm=False normalises each
# curve on its own rather than scaling the curves by group size.
plt.figure(figsize=(12, 6))
ax = sns.kdeplot(
    data=df,
    x="HOUR",
    hue="PRIMARY DESCRIPTION",
    common_norm=False,
)
ax.set_title("Distribution of Different Crime Types by Hour")
Relational Plots: Scatter Plot
# Plot every incident at its longitude/latitude, coloured by crime type.
plt.figure(figsize=(12, 8))
ax = sns.scatterplot(
    data=df,
    x="LONGITUDE",
    y="LATITUDE",
    hue="PRIMARY DESCRIPTION",
)
ax.set_title("Geographical Distribution of Crimes")
Relational Plots: Line Plot
# Count incidents per calendar day. The timestamps include a time of day (the
# hour is read from them elsewhere in this file), so grouping on the raw
# column would count incidents per exact timestamp instead of showing a trend.
# Flooring to the day keeps the datetime dtype and the column name.
occurrence_day = df["DATE OF OCCURRENCE"].dt.floor("D")
crime_counts = df.groupby(occurrence_day).size().reset_index(name="COUNT")

plt.figure(figsize=(10, 5))
sns.lineplot(data=crime_counts, x="DATE OF OCCURRENCE", y="COUNT")
plt.title("Crime Trends Over Time")
plt.xticks(rotation=45)
Advanced Customization
plt.figure(figsize=(14, 6))
sns.set_style("whitegrid")
sns.set_palette("deep")

# The ten most frequent crime types; value_counts() lists the most common first.
top_crime_types = df["PRIMARY DESCRIPTION"].value_counts().index[:10]
g = sns.countplot(data=df, y="PRIMARY DESCRIPTION", order=top_crime_types)

g.set_title("Top 10 Crime Types", fontsize=20)
g.set_xlabel("Count", fontsize=14)
g.set_ylabel("Crime Type", fontsize=14)

# Annotate each bar with its count via Axes.bar_label, one container at a time.
for bar_group in g.containers:
    g.bar_label(bar_group)
# g.bar_label(g.containers[0])

# Manual alternative: place the labels with Axes.text
# for i, v in enumerate(df["PRIMARY DESCRIPTION"].value_counts()[:10]):
#     g.text(v + 3, i, str(v), color="black", va="center")

plt.tight_layout()
Categorical Plots: Box Plot
# Weekday name of each occurrence timestamp.
df["DAY_OF_WEEK"] = df["DATE OF OCCURRENCE"].dt.day_name()

# Show the weekdays in calendar order; without an explicit order the
# categories are laid out in the order they happen to appear in the data.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

plt.figure(figsize=(12, 5))
sns.boxplot(
    data=df,
    x="DAY_OF_WEEK",
    y="DATE OF OCCURRENCE",
    order=weekday_order,
).set_ylabel("Date")
plt.title("Distribution of Crimes by Day of the Week")
Categorical Plots: Violin Plot
Now, it’s your turn!
# Exercise: Change the boxplot to a violinplot
# Hint: Use sns.violinplot()
plt.figure(figsize=(12, 5))
# Your code here:

plt.title("Distribution of Crimes by Day of the Week")
# Show the weekdays in calendar order; without an explicit order the
# categories are laid out in the order they happen to appear in the data.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

plt.figure(figsize=(12, 5))
sns.violinplot(
    data=df,
    x="DAY_OF_WEEK",
    y="DATE OF OCCURRENCE",
    order=weekday_order,
).set_ylabel("Date")
Categorical Plots: Enhanced Box Plot
# Exercise: replace the violin plot with a boxen plot.
# Hint: sns.boxenplot() takes the same data/x/y arguments.
plt.figure(figsize=(12, 5))

# Your code here:

plt.title("Distribution of Crimes by Day of the Week")
# Show the weekdays in calendar order; without an explicit order the
# categories are laid out in the order they happen to appear in the data.
weekday_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

plt.figure(figsize=(12, 5))
sns.boxenplot(
    data=df,
    x="DAY_OF_WEEK",
    y="DATE OF OCCURRENCE",
    order=weekday_order,
).set_ylabel("Date")
Summary of Categorical Plots
boxenplot: Draw an enhanced box plot for larger datasets.
boxplot: Draw a box plot to show distributions with respect to categories.
violinplot: Draw a patch representing a KDE and add observations or box plot statistics.
stripplot: Draw a categorical scatterplot using jitter to reduce overplotting.
swarmplot: Draw a categorical scatterplot with points adjusted to be non-overlapping.
API Summary
Useful for visualizing correlation between variables
Can show patterns and relationships in complex datasets
# Select numeric columns (correlation is only computed between these)
numeric_cols = df.select_dtypes(include=[np.number]).columns

# Compute correlation matrix
corr_matrix = df[numeric_cols].corr()

# Build a boolean mask that is True on the upper triangle, diagonal included.
# heatmap() leaves masked (True) cells blank, which hides the redundant
# mirrored half of the symmetric matrix and the diagonal.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
plt.figure(figsize=(8, 8))

# Draw the heatmap; with the mask applied only the lower triangle is shown
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    fmt=".2f",
)
plt.xticks(rotation=45, ha="right")
Customized Heatmap
Pair Plot
Useful for exploring relationships between multiple variables
Creates a grid of scatter plots for each pair of variables
Pair Plot
# Select relevant columns for the pair plot
# Hint: Choose a few relevant columns and use sns.pairplot()
cols_to_plot = ["X COORDINATE", "Y COORDINATE", "LATITUDE", "LONGITUDE"]

# Add hour of day
df["HOUR"] = pd.to_datetime(df["DATE OF OCCURRENCE"]).dt.hour

# Create the pair plot. pairplot() is a figure-level function that builds its
# own figure, so no plt.figure() call precedes it -- one would only leave an
# empty extra figure behind.
pairplot = sns.pairplot(
    df[cols_to_plot + ["HOUR", "PRIMARY DESCRIPTION"]],
    hue="PRIMARY DESCRIPTION",
    palette="viridis",
    plot_kws={"alpha": 0.6},
    diag_kind="kde",
)

# Use the grid's own tight_layout(): it respects the space the grid reserved
# for the legend, whereas plt.tight_layout() spreads the axes over the whole
# figure.
pairplot.tight_layout()
Pair Plot
Regression Plot
Visualizes the relationship between two variables
Includes a linear regression line and confidence interval
Demonstrates how to create multiple plots in a grid
Useful for comparing distributions across categories
# Create a FacetGrid with one panel per crime type, three panels per row.
# FacetGrid builds its own figure, so no plt.figure() call precedes it -- one
# would only leave an empty extra figure behind.
g = sns.FacetGrid(df, col="PRIMARY DESCRIPTION", col_wrap=3, height=4, aspect=1.5)

# Map a histogram to each facet.
# NOTE(review): the mapping call was garbled in the source -- only
# `, "HOUR", bins=24)` survived. sns.histplot is assumed as the plotting
# function; confirm against the original notebook.
g.map(sns.histplot, "HOUR", bins=24)

# Customize the plot
g.set_axis_labels("Hour of Day", "Count")
g.set_titles("{col_name}")
# y=1.02 places the overall title just above the top row of panels.
g.fig.suptitle("Distribution of Crimes by Hour for Different Crime Types", y=1.02)
g.tight_layout()