Visualisation All
import numpy as np
import matplotlib.pyplot as plt
Another Basic Plot
# To avoid calling plt.show()
# we can use
%matplotlib
# in IPython (interactive Python shell)
# and for Jupyter Notebook use
%matplotlib inline
# In IPython:
plt.close() # to close the figure window
Ticks, …
plt.plot(np.random.randn(1000).cumsum())
# define tick locations and labels (example values), then set them:
tick_loc = [0, 250, 500, 750, 1000]
tick_lab = ['one', 'two', 'three', 'four', 'five']
plt.xticks(tick_loc, tick_lab, rotation=30, fontsize='small')
…, Labels, …
# Change the x-axis label:
plt.xlabel('Stages')
… and Legends
# Create three sets of random data
data = [[], [], []]
for i in range(3):
    data[i] = np.random.randn(100).cumsum()
# plot them with example line styles and legend labels
styles = ['k-', 'k--', 'k.']
labels = ['one', 'two', 'three']
for i in range(3):
    plt.plot(data[i], styles[i], label=labels[i])
plt.legend(loc='best')
# see also:
# https://matplotlib.org/users/annotations.html
Time to Save the Plot
filename = 'UpDown.png'
plt.savefig(filename)
Scatter Plot
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()
plt.scatter(xvals, yvals)
# repeat with fresh random data
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()
plt.scatter(xvals, yvals)
Scatter Plot (cont'd)
# random sizes for the dots
sizes = abs(np.random.randn(100) * 100)
plt.scatter(xvals, yvals, s=sizes)
# repeat with fresh random data
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()
plt.scatter(xvals, yvals, s=sizes)
Histograms
vals = np.random.randn(100)
# Univariate Histogram
plt.hist(vals, alpha=0.5)
Figures and Subplots
# an empty figure
fig = plt.figure()
Figures and Subplots (con't)
# add two subplots and draw into them
# (subplot layout assumed; the original slide omits the ax1/ax2 creation)
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
# histogram
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)
# scatterplot
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
# in IPython
plt.close()  # to close the figure window
Grids of Subplots
# figure with 2x3 subplots
fig, axes = plt.subplots(2, 3)
Grids of Subplots (cont'd)
# Make histograms directly comparable by
# sharing same x-axis ticks and y-axis ticks.
# Have to do this WHEN creating the figure
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
First, Read Data from CSV file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sales = pd.read_csv("https://raw.githubusercontent.com/GerhardTrippen/DataSets/master/sample-salesv2.csv",
                    parse_dates=['date'])
sales.head()
sales.dtypes
sales.describe()
sales['unit price'].describe()
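A minimal sketch of what `parse_dates` does, using an inline CSV in place of the course URL (the two column names below simply mirror the sales file; the values are made up):

```python
import io
import pandas as pd

csv_text = "date,ext price\n2014-01-01,100.5\n2014-02-01,200.0\n"
# parse_dates converts the named column to datetime64 on read;
# without it, 'date' would stay as plain strings (object dtype)
df = pd.read_csv(io.StringIO(csv_text), parse_dates=['date'])
```

Having a true datetime column is what later makes `set_index('date')` and `resample()` possible.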
Customers
customers = sales[['name','ext price','date']]
customers.head()
customer_group = customers.groupby('name')
customer_group.size()
sales_totals = customer_group.sum()
sales_totals.sort_values('ext price').head()
my_plot = sales_totals.plot(kind='bar')
my_plot = sales_totals.plot(kind='barh')
# identical
my_plot = sales_totals.plot.bar()
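The groupby-then-plot pattern above can be sketched on a tiny made-up frame (customer names and prices are hypothetical):

```python
import pandas as pd

# hypothetical mini version of the sales data
sales_demo = pd.DataFrame({
    'name': ['Acme', 'Acme', 'Beta'],
    'ext price': [10.0, 30.0, 25.0],
})
# one row per customer, summing their purchases
totals = sales_demo.groupby('name').sum()
# ascending sort, as on the slide
totals_sorted = totals.sort_values('ext price')
```

`totals_sorted.plot(kind='bar')` would then draw exactly the bar chart shown, smallest customer first.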
Customers – Title and Labels
my_plot = sales_totals.sort_values('ext price', ascending=False).plot(
    kind='bar', legend=None, title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
Customers with Product Category
customers = sales[['name', 'category', 'ext price', 'date']]
customers.head()
category_group = customers.groupby(['name', 'category']).sum()
category_group.head(10)
category_group = category_group.unstack()  # pivot the category level into columns
category_group.head(10)
my_plot = category_group.plot(kind='bar', stacked=True,
                              title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
my_plot.legend(["Belts", "Shirts", "Shoes"], loc='best', ncol=3)
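What `unstack()` does to the two-level groupby result can be seen on a tiny made-up frame (names, categories, and prices are hypothetical):

```python
import pandas as pd

demo = pd.DataFrame({
    'name': ['Acme', 'Acme', 'Beta'],
    'category': ['Belt', 'Shirt', 'Belt'],
    'ext price': [10.0, 20.0, 5.0],
})
# two-level index: (name, category)
grouped = demo.groupby(['name', 'category']).sum()
# unstack pivots the inner index level (category) into columns,
# inserting NaN where a customer bought nothing in a category
wide = grouped.unstack()
```

The wide frame is what `plot(kind='bar', stacked=True)` needs: one row per customer, one column per category.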
Customers with Product Category –
Sorted!
category_group = category_group.sort_values(('ext price', 'Belt'),
                                            ascending=False)
category_group.head()
my_plot = category_group.plot(kind='bar', stacked=True,
                              title="Total Sales by Customer")
# purchase_patterns is assumed built on an omitted slide, e.g.:
purchase_patterns = sales[['ext price', 'date']]
purchase_plot = purchase_patterns['ext price'].hist(bins=20)
Purchase Patterns – Timeline
# Take date from the data and make it the index
purchase_patterns = purchase_patterns.set_index('date')
purchase_patterns.head()
# sorted by time
purchase_patterns.sort_index()
# resampled by months
purchase_plot = purchase_patterns.resample('M').sum().plot(
    title="Total Sales by Month", legend=None)
Boxplots …
# Box and Whisker Plots
sales.boxplot() # Not very useful!
# 'visitors' is a separate dataset loaded on an earlier (omitted) slide
print(visitors.shape)
print(visitors.head())
print(visitors.dtypes)
Histograms, Density Plots, Box and
Whisker Plots
# Univariate Histograms
visitors.hist()
Correlation Matrix Plot
# correlation matrix
correlations = visitors.corr()
# plot correlation matrix (generic)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
Hypothesis Testing Versus
Exploratory Data Analysis
An analyst may have an “a priori” hypothesis (one presupposed
by experience) to test
For example, has increasing fee-structure led to
decreasing market share?
Hypothesis Testing: test hypothesis market share has
decreased
Hypothesis Testing Vs
Exploratory Data Analysis (cont’d)
However, we do not always have a priori notions about data
In this case, use Exploratory Data Analysis (EDA)
Approach useful for:
– Delving into data
– Examining important interrelationships between attributes
– Identifying interesting subsets or patterns
– Discovering possible relationships between predictors
and target variable
Getting to Know the Data Set
– Graphs, plots, and tables often uncover important
relationships in data
– The 3,333 records and 21 variables in churn data set
are explored (see churn.txt)
– Simple approach looks at field values of records
Getting to Know the Data Set –
Python Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# load the churn data set (file name as given on the previous slide)
churn = pd.read_csv("churn.txt")
churn.shape
churn.dtypes
churn.info()
Getting to Know the Data Set (cont’d)
– Eight of the attributes:
» State: categorical
» Account Length: numeric
» Area Code: categorical
» Phone: categorical
» Intl Plan: Boolean
» VMail Plan: Boolean
» Vmail Messages: numeric
» Day Mins: numeric
– “churn” attribute indicates customers leaving one company
in favor of another company’s products or services
Exploratory Data Analysis
Goals:
– Investigate variables as part of the Data Understanding
Phase
» Numeric: analyze histograms, scatter plots,
summary statistics
» Categorical: examine distributions,
cross-tabulations
– Become familiar with data
– Explore relationships among variable sets
– While performing EDA, remain focused on the objective,
i.e., creating a data mining model of customers likely to
“churn”
Exploring the Target – Python Code
churn["Churn?"]
# in comparison: matplotlib
churn["Churn?"].value_counts().plot(kind='bar', title=
"Churning Customers") 37
Exploring Categorical Variables
– Cross-tabulation quantifies relationship between Churn and
International Plan
– International Plan and Churn variables are both categorical
– Figure 3.4 shows proportion of customers in the
International Plan with churn overlay
– International Plan: yes = 9.69%, no = 90.31%
– Possibly, a greater proportion of those in the International
Plan are churners?
Exploring Categorical Variables –
Python Code
# churn_crosstab is assumed created on an earlier slide, e.g.:
churn_crosstab = pd.crosstab(churn['Intl Plan'], churn['Churn?'])
# normalize each row by its row total
churn_crosstab_norm = churn_crosstab.div(churn_crosstab.sum(axis=1), axis=0)
churn_crosstab_norm
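A minimal sketch of the row-normalized cross-tabulation on a tiny made-up frame (the column names and category values are assumptions mirroring the churn file):

```python
import pandas as pd

demo = pd.DataFrame({
    'Intl Plan': ['yes', 'yes', 'no', 'no'],
    'Churn?': ['True.', 'False.', 'False.', 'False.'],
})
# counts of each (plan, churn) combination
ct = pd.crosstab(demo['Intl Plan'], demo['Churn?'])
# divide each row by its row total so every row sums to 1
ct_norm = ct.div(ct.sum(axis=1), axis=0)
```

After normalization each row is a conditional distribution: the churn rate *given* plan membership, which is what the overlay bar chart displays.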
Exploring Numeric Variables
– Numeric summary measures for several variables shown
– see analysis in Python: churn.describe()
– Includes min and max, mean, median, std, and 1st
and 3rd quartile
– For example, Account Length has min = 1 and max = 243
– Mean and median both ~101, which indicates
symmetry
# Density Plots
churn.plot(kind='density', subplots=True, layout=(4, 4), sharex=False)
# distplot is deprecated in newer seaborn; histplot is its replacement
sns.distplot(churn["CustServ Calls"])
– Again, histogram of
Customer Service Calls shown
– Normalized values enhance pattern
of churn
– Customers calling customer service
3 or fewer times are far less likely to churn
– Results: Carefully track number of customer service calls
made by customers; Offer incentives to retain those
making higher number of calls
– Data mining model will probably include Customer
Service Calls as predictor
Exploring Numerical Variables –
Python Code
import numpy as np
Exploring Numeric Variables (cont’d)
– Normalized histogram of Day Minutes shown
with Churn overlay (Top)
– Indicates high usage customers churn at
significantly greater rate
– Results: Carefully track customer Day
Minutes once the total exceeds 200
– Investigate why those with high usage tend
to leave
– Normalized histogram of Evening Minutes
shown with Churn overlay (Bottom)
– Higher usage customers churn slightly
more
– Results: Based on graphical evidence, we
cannot conclude beyond a reasonable
doubt that such an effect exists
Exploring Numeric Variables (cont’d)
– Additional EDA concludes no obvious association between
Churn and remaining numeric attributes (not shown)
– These numeric attributes probably not strong predictors in
data model
– However, they should be retained as input to model
– Important higher-level associations/interactions may exist
– In this case, let model identify which inputs are important
– Data mining performance adversely affected by many inputs
– Possibility: Use dimension-reduction technique such as
principal components analysis
Exploring Multivariate Relationships
– Multivariate graphics can uncover new interaction effects
which our univariate exploration missed
– Figure 3.20 shows a scatter plot of day minutes vs.
evening minutes, with churners indicated by the darker
circles
Selecting Interesting Subsets of
the Data for further Investigation
– Graphical EDA can uncover subsets of records that call
for further investigation, as the rectangle in Figure 3.21
illustrates
Exploring Multivariate Relationships –
Python Code
sns.scatterplot(x="Day Mins", y="Eve Mins", data=churn)
Selecting Interesting Subsets of the
Data for further Investigation (cont'd)
– Figure 3.22 shows that about 65% (115 of 177) of the
selected records are churners
– Those with high customer service calls and low day
minutes have a 65% probability of churning
– Figure 3.23 shows that only about 26% of customers with
high customer service calls and high day minutes are
churners
– Red-flag customers with high customer service calls and
low day minutes
Using EDA to uncover Anomalous Fields
– Exploratory data analysis will sometimes uncover strange or
anomalous records or fields which the earlier data cleaning
phase may have missed
– E.g., the area code field contains only three different values
for all the records, 408, 415, and 510 (which all happen to
be California area codes), as shown by Figure 3.24
Binning based on Predictive Value –
Python Code
churn['Eve Mins binned'] = pd.cut(x=churn['Eve Mins'],
    bins=[0, 160.01, 240.01, 400],
    labels=["Low", "Medium", "High"], right=False)
Using EDA to Investigate
Correlated Predictor Variables
– Just because two variables are correlated does not mean that we
should omit one of them
– Instead use the following strategy:
1) Identify any variables that are perfectly correlated (that is, r = 1.0
or r = -1.0). Do not retain both variables in the model, but rather omit
one
2) Identify groups of variables that are correlated with each other.
Then, later, during the modeling phase, apply dimension reduction
methods, such as principal components analysis, to these variables
Using EDA to Investigate
Correlated Predictor Variables (cont'd)
– There does not seem to be any relationship between day
minutes and day calls, nor between day calls and day charge
– On the other hand, there is a perfect linear relationship between
day minutes and day charge, indicating that day charge is a
simple linear function of day minutes only
– We may express this function as the estimated regression
equation:
Day charge = 0.000613 + 0.17 × Day minutes
– Since day charge is perfectly correlated with day minutes, then
we should eliminate one of the two variables
– After dealing with the perfectly correlated predictors, the
correlation of each numerical predictor with every other
numerical predictor should be checked: (see next slide)
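The two-step strategy (spot the perfect correlation, confirm the linear function) can be sketched on synthetic data built from the slide's own equation (the generated minutes and calls are made up; only the coefficients come from the slide):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
day_mins = pd.Series(rng.uniform(0, 400, size=200))
# Day Charge as the linear function quoted on the slide
day_charge = 0.000613 + 0.17 * day_mins
day_calls = pd.Series(rng.uniform(50, 150, size=200))  # unrelated predictor

df = pd.DataFrame({'Day Mins': day_mins, 'Day Charge': day_charge,
                   'Day Calls': day_calls})
# step 1: the correlation matrix exposes the r = 1.0 pair
corr = df.corr()
# step 2: a degree-1 fit recovers the slope and intercept
slope, intercept = np.polyfit(day_mins, day_charge, 1)
```

Finding r = 1.0 in `corr` is the signal to drop one of the pair; the recovered slope of 0.17 confirms the functional relationship.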
Using EDA to Investigate
Correlated Predictor Variables (cont'd)
– All relationships between the remaining numerical
predictors are very weak and statistically not significant.
Correlated Predictor Variables –
Python Code
from pandas.plotting import scatter_matrix
scatter_matrix(churn)
# correlation matrix
correlations = churn.corr()
print(correlations)
Correlated Predictor Variables –
Python Code
# plot correlation matrix (generic)
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
Our EDA – Brief Summary
– The four charge fields are linear functions of the minute fields, and
should be omitted
– The area code field and/or the state field are anomalous, and should
be omitted until further clarification is obtained
– Some insights with respect to churn are as follows:
– Customers with the International Plan tend to churn more
frequently
– Customers with the Voice Mail Plan tend to churn less frequently
– Customers with four or more Customer Service Calls tend to
churn more frequently
– Customers with both high Day Minutes and high Evening
Minutes tend to churn at a higher rate than the other customers
– Customers with low Day Minutes and high Customer Service
Calls churn at a higher rate than the other customers