Visualisation All

Download as pptx, pdf, or txt
Download as pptx, pdf, or txt
You are on page 1 of 70

A Basic Plot

import numpy as np
import matplotlib.pyplot as plt

# Create an array filled with the integers 0 to 9
data = np.arange(10)
print(data)

# Create a line plot of the data (only the y-values are passed)
plt.plot(data)

# Show the plot
plt.show()

1
Another Basic Plot
# To avoid calling plt.show() explicitly, use the magic command below
# in IPython (Python shell):
%matplotlib
# and in a Jupyter Notebook use the inline variant instead:
%matplotlib inline

# Create a random data series: running total of 50 random draws
data = np.random.randn(50).cumsum()
print(data)

# Create a plot of the data
plt.plot(data)
# With the magic command active, the plot is shown immediately.
2
And Another Basic Plot
# Create a random data series
data = np.random.randn(50).cumsum()
print(data)

# Create a plot of the data
plt.plot(data)

# NOTE: In IPython this is drawn into the same figure as the previous plot!

# So, in IPython use either
plt.clf() # to clear a figure
# or
plt.close() # to close a figure window.
# This has no visible effect in the Jupyter Notebook.
3
Colors, Markers, and Line Styles
# Create a plot of the data
# as a dashed line ('--'), with circle markers ('o'), in green ('g')
plt.plot(data, linestyle='--', marker='o', color='g')

# or, for a freshly generated series, as a dotted line (':'),
# with upward triangles ('^'), in red ('r')
data = np.random.randn(50).cumsum()
plt.plot(data, linestyle=':', marker='^', color='r')

# In IPython:
plt.close() # to close the figure window

4
Ticks, …
# Plot a random walk of 1000 steps.
plt.plot(np.random.randn(1000).cumsum())

# Change the x-axis ticks.
# tick_loc says WHERE the ticks go, tick_lab says WHAT is written there;
# both lists must have the same length.
tick_loc = [0, 250, 500, 750, 1000]
tick_lab = ['one', 'two', 'three', 'four', 'five']

# Apply them; the extra keywords style the tick label text.
plt.xticks(tick_loc, tick_lab, rotation=30, fontsize='small')

# continue with code on next page in same cell!

5
…, Labels, …
# Change the x-axis label:
plt.xlabel('Stages')

# Change the y-axis label (text rotated by 90 degrees):
plt.ylabel('Distance', rotation=90)

# Change the title:
plt.title('Not my first matplotlib plot!')

# In IPython, clear the figure:
plt.clf()

6
… and Legends
# Create three sets of random data (one array of 100 values per series)
data = [np.random.randn(100).cumsum() for _ in range(3)]

# Define a matplotlib format string and a legend label for each series
styles = ['ko-', 'gs--', 'b.-.']
labels = ['one', 'two', 'three']

# Plot them; the label passed here is what the legend displays
for series, style, label in zip(data, styles, labels):
    plt.plot(series, style, label=label)

# Finally, the legend ('best' lets matplotlib choose the location)
plt.legend(loc='best')
7
Annotations and Drawing on a Plot
# Free-standing text placed at data coordinates (10, 5)
plt.text(10, 5, 'Just to say Hello!', family='monospace', fontsize=10)

# A more meaningful annotation: mark the peak of series 'one'.
# argmax() gives the position of the maximum, max() its value.
x = data[0].argmax()
y = data[0].max()
arrow_style = dict(facecolor='black', headwidth=4, width=2, headlength=4)
plt.annotate('Peak One',
             xy=(x+0.1, y+0.1),        # point the arrow refers to
             xytext=(x+1, y+1),        # where the label text is placed
             arrowprops=arrow_style,
             horizontalalignment='left',
             verticalalignment='top')

# see also:
# https://matplotlib.org/users/annotations.html
8
Time to Save the Plot
# Write the current figure to a PNG file
filename = 'UpDown.png'
plt.savefig(filename)

# and close it (in IPython)
plt.close()

9
Scatter Plot
# Two independent random series used as x and y coordinates
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()

# plot in scatter plot
plt.scatter(xvals, yvals)

# repeat with a second, freshly generated pair of series
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()

# plot in scatter plot
plt.scatter(xvals, yvals)

10
Scatter Plot (cont'd)
# random size of dots (abs() keeps every size non-negative)
sizes = abs(np.random.randn(100) * 100)

# repeat
xvals = np.random.randn(100).cumsum()
yvals = np.random.randn(100).cumsum()

# plot in scatter plot; s gives one marker size per point
plt.scatter(xvals, yvals, s=sizes)

# and close it (in IPython)
plt.close()

11
Histograms
vals = np.random.randn(100)

# Univariate histogram with the default number of bins;
# alpha=0.5 makes the bars half-transparent
plt.hist(vals, alpha=0.5)

# with six / twenty bins
plt.hist(vals, bins=6, alpha=0.5)
plt.hist(vals, bins=20, alpha=0.5)

12
Figures and Subplots
# an empty figure
fig = plt.figure()

# explicitly create subplots on a 2x2 grid;
# positions are numbered starting from 1!
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)

# matplotlib draws on the last figure and subplot used
plt.plot(np.random.randn(50).cumsum(), 'k--')

# continue with code on next page in same cell!

13
Figures and Subplots (cont'd)
# draw into another subplot
# histogram
ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)

# scatterplot: a straight line plus random noise
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))

# select the scatter plot (position 2 of the 2x2 grid) and write into it
plt.subplot(2, 2, 2)
plt.text(5, 20, 'Positive Linear Relationship')

# in IPython
plt.close()
14
Grids of Subplots
# figure with 2x3 subplots
fig, axes = plt.subplots(2, 3)

# the axes are stored in an array
axes
# and can be easily indexed, e.g.,
axes[0, 1]

# fill the subplots with histograms, one per (row, column) position
for i in range(2):
    for j in range(3):
        axes[i, j].hist(np.random.randn(500), bins=50,
                        color='k', alpha=0.5)

15
Grids of Subplots (cont'd)
# Make histograms directly comparable by
# sharing same x-axis ticks and y-axis ticks.
# Have to do this WHEN creating the figure
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)

# fill the subplots with histograms, one per (row, column) position
for i in range(2):
    for j in range(3):
        axes[i, j].hist(np.random.randn(500), bins=50,
                        color='k', alpha=0.5)

# change spacing around the subplots (0 removes the gaps between them)
plt.subplots_adjust(wspace=0, hspace=0)

16
First, Read Data from CSV file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read the sample sales data straight from GitHub; the 'date' column is
# parsed into datetime values while loading.
sales = pd.read_csv(
    "https://raw.githubusercontent.com/GerhardTrippen/DataSets/master/"
    "sample-salesv2.csv",
    parse_dates=['date'])
sales.head()
sales.dtypes

sales.describe()

sales['unit price'].describe()
17
Customers
customers = sales[['name', 'ext price', 'date']]
customers.head()

# Group the rows by customer name; size() counts the rows per customer
customer_group = customers.groupby('name')
customer_group.size()

# Total per customer. numeric_only=True keeps the datetime 'date' column out
# of the sum; pandas >= 2.0 raises a TypeError when asked to sum datetimes.
sales_totals = customer_group.sum(numeric_only=True)
sales_totals.sort_values('ext price').head()

my_plot = sales_totals.plot(kind='bar')
my_plot = sales_totals.plot(kind='barh')

# identical to plot(kind='bar')
my_plot = sales_totals.plot.bar()
18
Customers – Title and Labels
# Rank customers by total sales, largest first, then draw the bar chart
# without a legend.
ranked_totals = sales_totals.sort_values('ext price', ascending=False)
my_plot = ranked_totals.plot(kind='bar', legend=None,
                             title="Total Sales by Customer")

# Axis labels make the figure self-explanatory
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")

19
Customers with Product Category
customers = sales[['name', 'category', 'ext price', 'date']]
customers.head()

# Total per (customer, category). numeric_only=True keeps the datetime 'date'
# column out of the sum (pandas >= 2.0 raises when summing datetimes).
category_group = customers.groupby(['name', 'category']).sum(numeric_only=True)
category_group.head(10)

# unstack() moves the inner index level ('category') into the columns, so
# each customer becomes one row with one column per category
category_group = category_group.unstack()
category_group.head(10)

my_plot = category_group.plot(kind='bar', stacked=True,
                              title="Total Sales by Customer")
my_plot.set_xlabel("Customers")
my_plot.set_ylabel("Sales ($)")
my_plot.legend(["Belts", "Shirts", "Shoes"], loc='best', ncol=3)
20
Customers with Product Category –
Sorted!
# Sort by belt sales, largest first. The column is addressed by the tuple
# ('ext price', 'Belt') because the columns have two levels here.
category_group = category_group.sort_values(('ext price', 'Belt'),
                                            ascending=False)
category_group.head()
my_plot = category_group.plot(kind='bar', stacked=True,
                              title="Total Sales by Customer")

# sort by total without showing total!
# Add a temporary row-sum column, sort on it, then drop it again before plotting.
category_group['total'] = category_group.sum(axis=1)
category_group = category_group.sort_values('total', ascending=False)
category_group.head()
category_group.drop('total', axis=1, inplace=True)
my_plot = category_group.plot(kind='bar', stacked=True,
                              title="Total Sales by Customer")
21
Purchase Patterns
purchase_patterns = sales[['category', 'ext price', 'date']]
purchase_patterns.head()

# Histogram of the order amounts with 20 bins
purchase_plot = purchase_patterns['ext price'].hist(bins=20)

# done many times now,
# but should always be done to make the figure self-explanatory
purchase_plot.set_title("Purchase Patterns")
purchase_plot.set_xlabel("Order Amount ($)")
purchase_plot.set_ylabel("Number of Orders")

22
Purchase Patterns – Timeline
# Take the date from the data and make it the index
purchase_patterns = purchase_patterns.set_index('date')
purchase_patterns.head()

# sorted by time (the result is only displayed, not stored)
purchase_patterns.sort_index()

# resampled by months; only the numeric 'ext price' column is selected so
# that the text column 'category' is not part of the sum
purchase_plot = purchase_patterns[['ext price']].resample('M').sum().plot(
    title="Total Sales by Month", legend=None)

# save the figure
fig = purchase_plot.get_figure()
fig.savefig("total-sales.png")
23
Purchase Patterns – Timeline
by Categories
fig, ax = plt.subplots()

# key gives the group name (i.e., category), data gives the actual values;
# every category is drawn as its own line into the same axes
for key, data in purchase_patterns.groupby('category'):
    data.resample('M').sum().plot(y='ext price', ax=ax, label=key)

# change y range to start from 0 with matplotlib
ax.set_ylim(bottom=0)

24
Boxplots …
# Box and Whisker Plots
sales.boxplot() # Not very useful!

# Four boxplots in one figure with individual scales
sales.plot(kind='box', subplots=True, layout=(2, 2),
           sharex=False, sharey=False)
# This was not working for some time in some pandas versions
# because of the date column, so:
sales.drop('date', axis=1).plot(kind='box', subplots=True, layout=(2, 2),
                                sharex=False, sharey=False)

# Individual boxplots for all names
sales.boxplot(column="ext price", by="name")
plt.xticks(rotation='vertical')
25
… and Histograms
# Histograms
sales.hist()

# for individual variables in one plot
# (the date column is dropped before plotting)
sales.drop('date',axis=1).plot(kind='hist',
subplots=True, layout=(2,2), sharex=False,
sharey=False) # "ignored", unfortunately

# individual plots for individual customers: one histogram per name
sales.hist(column="ext price", by="name", bins=30)

# same axes for comparison
sales.hist(column="ext price", by="name", bins=30,
sharex=True, sharey=True)
26
First, Read Data from CSV file
import numpy as np
import matplotlib.pyplot as plt

# Load CSV using pandas


import pandas as pd
from pandas import read_csv
# AirBnB website visitors
# Load the local CSV file, using the 'id_visitor' column as the row index
filename = 'visitors.csv'
visitors = read_csv(filename, index_col='id_visitor')
print(visitors.head())

# Inspect size, first rows and column types
print(visitors.shape)
print(visitors.head())
print(visitors.dtypes)
27
Histograms, Density Plots, Box and
Whisker Plots
# Univariate Histograms
visitors.hist()

# Univariate Density Plots, one subplot per column on a 2x2 grid
visitors.plot(kind='density', subplots=True,
layout=(2,2), sharex=False)

# Box and Whisker Plots, each subplot with its own axis scales
visitors.plot(kind='box', subplots=True, layout=(2,2),
sharex=False, sharey=False)

28
Correlation Matrix Plot
# correlation matrix
correlations = visitors.corr()

# plot correlation matrix (generic)
fig = plt.figure()
ax = fig.add_subplot(111)
# fix the colour scale to the full correlation range [-1, 1]
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# change the tick labels
# Take both the number of ticks and the label text from the correlation
# matrix itself, so they always match what is drawn instead of relying on a
# hard-coded column count.
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns)
ax.set_yticklabels(correlations.columns)
29
Scatter Plot Matrix
# Scatterplot Matrix: pairwise scatter plots of the columns of visitors
from pandas.plotting import scatter_matrix
scatter_matrix(visitors)

30
Hypothesis Testing Versus
Exploratory Data Analysis
• An analyst may have an “a priori” hypothesis (presupposed by
experience) to test
• For example, has an increasing fee structure led to
decreasing market share?
• Hypothesis Testing: test the hypothesis that market share has
decreased

31
Hypothesis Testing Vs
Exploratory Data Analysis (cont’d)
• However, we do not always have a priori notions about the data
• In this case, use Exploratory Data Analysis (EDA)
• The approach is useful for:
– Delving into data
– Examining important interrelationships between attributes
– Identifying interesting subsets or patterns
– Discovering possible relationships between predictors
and target variable

32
Getting to Know the Data Set
– Graphs, plots, and tables often uncover important
relationships in data
– The 3,333 records and 21 variables in churn data set
are explored (see churn.txt)
– Simple approach looks at field values of records

33
Getting to Know the Data Set –
Python Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read in the Churn data set
churn = pd.read_csv("churn.txt")

# Number of rows and columns
churn.shape

# Show the first ten records
churn.head(10)

# Column types and a summary of the data frame
churn.dtypes
churn.info()
34
Getting to Know the Data Set (cont’d)
– Eight of the attributes:
» State: categorical
» Account Length: numeric
» Area Code: categorical
» Phone: categorical
» Intl Plan: Boolean
» VMail Plan: Boolean
» Vmail Messages: numeric
» Day Mins: numeric
– “churn” attribute indicates customers leaving one company
in favor of another company’s products or services
35
Exploratory Data Analysis
• Goals:
– Investigate variables as part of the Data Understanding
Phase
» Numeric → Analyze Histograms, Scatter Plots,
Summary Statistics
» Categorical → Examine Distributions, Cross-
tabulations
– Become familiar with data
– Explore relationships among variable sets
– While performing EDA, remain focused on the objective,
i.e., creating a data mining model of customers likely to
“churn”
36
Exploring the Target – Python Code
churn["Churn?"]

# Summarize the Churn variable
churn["Churn?"].value_counts()

# Calculate proportion of churners (in percent)
churn["Churn?"].value_counts(normalize=True) * 100

# Bar chart of variable Churn
# countplot returns the Axes it drew on; bind it so the title is set on this
# chart rather than on whatever `ax` happened to refer to before.
ax = sns.countplot(x=churn["Churn?"])
ax.set_title("Churning Customers")

# in comparison: matplotlib
churn["Churn?"].value_counts().plot(kind='bar', title="Churning Customers")
37
Exploring Categorical Variables
– Cross-tabulation quantifies relationship between Churn and
International Plan
– International Plan and Churn variables both categorical

» First column: total → International Plan = “no”
» Second column: total → International Plan = “yes”
» First row: total → Churn = “False”
» Second row: total → Churn = “True”
– Data set contains 346 + 137 = 483 churners
38
Exploring Categorical Variables (cont’d)
Quantifying the relationship:
– 42.4% of customers in International Plan churned
(137 / (137 + 186))
– 11.5% of customers not in International Plan churned
(346 / (346 + 2,664))
– Customers selecting International Plan more than 3×
more likely to leave company, as compared to those not
in plan
– Why does International Plan apparently cause customers
to leave?
– Data models predicting churn will likely include
International Plan as predictor
39
Exploring Categorical Variables (cont’d)

• International Plan
– Figure 3.4 shows proportion of customers in
International Plan with churn overlay
– International Plan: yes = 9.69%, no = 90.31%
– Possibly, greater proportion of those in International
Plan are churners?
40
Exploring Categorical Variables (cont’d)

– Again, Proportion of customers in International Plan


with churn overlay
– This time, same-sized bars used for each category
(normalized)
– Graphically, proportion of “churners” in each category
more apparent
– Those selecting International Plan more likely to churn
41
Exploring Categorical Variables –
Python Code
# do not add total margins for visualization
churn_crosstab = pd.crosstab(churn["Churn?"], churn["Int'l Plan"],
                             margins=False)

# the same table including row/column totals, for display only
pd.crosstab(churn["Churn?"], churn["Int'l Plan"], margins=True)

# seaborn does not (yet?) support stacked bar charts natively
sns.countplot(x="Int'l Plan", hue="Churn?", data=churn)

churn_crosstab.plot(kind='bar', stacked=True)

42
Exploring Categorical Variables –
Python Code
# Divide every row of the table by its row total, so each row sums to 1
churn_crosstab_norm = churn_crosstab.div(churn_crosstab.sum(axis=1), axis=0)

churn_crosstab_norm

churn_crosstab_norm.plot(kind='bar', stacked=True)

43
Exploring Categorical Variables (cont’d)

– Voice Mail Plan has 842 + 80 = 922 customers


– Remaining 2,008 + 403 = 2,411 customers not in plan
44
Exploring Categorical Variables (cont’d)
– Only 8.7% = 80/922 of those in plan are churners
– Of those not in plan, 16.7% = 403/2,411 are churners
– Therefore, those not participating in plan ~2× more
likely to churn, as compared to those in plan

– Perhaps customer loyalty can be increased by


simplifying enrollment into Voice Mail Plan?

– Data models predicting churn likely to include Voice


Mail Plan as predictor

45
Exploring Categorical Variables (cont’d)

• Voice Mail Plan


– Figure 3.10 shows proportion of customers in Voice
Mail Plan with churn overlay (normalized)
– Voicemail Plan: yes = 27.66%, no = 72.34%
– Those not participating in Voice Mail Plan appear
more likely to churn

46
Exploring Categorical Variables (cont’d)

– Two-way Interactions between Voice Mail Plan and


International Plan, with respect to churn shown
47
Exploring Categorical Variables (cont’d)

– Voice Mail Plan = no (constant)


– Many customers have
neither plan:
1,878 + 302 = 2,180
– Of those, 302/2,180 = 14%
are churners
– Customers in International Plan
and not in Voice Mail Plan
churn at rate 101/231 = 44%

48
Exploring Numeric Variables
– Numeric summary measures for several variables shown
– see analysis in Python: churn.describe()
– Includes min and max, mean, median, std, and 1st
and 3rd quartile
– For example, Account Length has min = 1 and max = 243
– Mean and median both ~101, which indicates
symmetry

– Voice Mail Messages not symmetric; mean = 8.1 and


median = 0
– Median = 0 indicates half of customers had no voice
mail messages
49
Exploring Numerical Variables –
Python Code
# Histograms of the columns
churn.hist()

# Density Plots
churn.plot(kind='density', subplots=True, layout=(4, 4), sharex=False)
# NOTE(review): seaborn has deprecated distplot since v0.11 in favour of
# histplot/displot; kept here so the cell still runs on older seaborn versions.
sns.distplot(churn["CustServ Calls"])

# Box and Whisker Plots
churn.boxplot()
plt.xticks(rotation=90)
churn.plot(kind='box', subplots=True, layout=(4, 4),
           sharex=False, sharey=False)
# Customer service calls, split by the two values of the target
sns.boxplot(x=churn["Churn?"], y=churn["CustServ Calls"], data=churn)
50
Exploring Numeric Variables (cont’d)

– Histogram for Customer Service Calls attribute shown


– Increases understanding of attribute’s distribution
– Distribution is right-skewed and has mode = 1
– However, relationship to Churn not indicated (Left)
– Figure (Right) shows identical histogram including
Churn overlay
– Determining whether Churn proportion varies across … (sentence cut off in the source)
51
Exploring Numerical Variables –
Python Code
# Separate the variable Customer Service Calls by the two values of the target.
# .loc[row_mask, column] selects rows and column in one step, so each result
# is the single "CustServ Calls" column rather than the whole data frame.
churn_csc_T = churn.loc[churn["Churn?"] == "True.", "CustServ Calls"]
churn_csc_F = churn.loc[churn["Churn?"] == "False.", "CustServ Calls"]

# Create a stacked histogram of the two variables
plt.hist([churn_csc_T, churn_csc_F], bins=10, stacked=True)
plt.legend(['Churn = True', 'Churn = False'])
plt.title('Histogram of Customer Service Calls with Churn Overlay')
plt.xlabel('Customer Service Calls')
plt.ylabel('Frequency')
52
Exploring Numeric Variables (cont’d)

– Again, histogram of
Customer Service Calls shown
– Normalized values enhance pattern
of churn
– Customers calling customer service
3 or fewer times, far less likely to churn
– Results: Carefully track number of customer service calls
made by customers; Offer incentives to retain those
making higher number of calls
– Data mining model will probably include Customer
Service Calls as predictor
53
Exploring Numerical Variables –
Python Code
import numpy as np

# Count both groups over one shared set of 10 bins.
# The counts come from np.histogram rather than from
# plt.hist(..., stacked=True), because the latter returns CUMULATIVE heights
# (its second row is True + False), which would distort the proportions.
bins = np.histogram_bin_edges(np.concatenate([churn_csc_T, churn_csc_F]),
                              bins=10)
n = np.array([np.histogram(churn_csc_T, bins=bins)[0],
              np.histogram(churn_csc_F, bins=bins)[0]])

# One row per bin, columns = (churn True, churn False)
n_table = n.T.astype(float)

# Proportion of each group within a bin; bins without any records stay 0
# instead of producing a division by zero.
bin_totals = n_table.sum(axis=1, keepdims=True)
n_norm = np.divide(n_table, bin_totals,
                   out=np.zeros_like(n_table), where=bin_totals > 0)

# Left and right edge of every bin
ourbins = np.column_stack((bins[:-1], bins[1:]))
widths = ourbins[:, 1] - ourbins[:, 0]

# align='edge' makes each bar start at the bin's left edge and span the bin
plt.bar(x=ourbins[:, 0], height=n_norm[:, 0], width=widths, align='edge')
plt.bar(x=ourbins[:, 0], height=n_norm[:, 1], width=widths,
        bottom=n_norm[:, 0], align='edge')
54
Exploring Numerical Variables –
Python Code
# Legend entries follow the order in which the two bar series were drawn
plt.legend(['Churn = True', 'Churn = False'])
plt.title('Normalized Histogram of Customer Service Calls with Churn Overlay')
plt.xlabel('Customer Service Calls')
plt.ylabel('Proportion')

55
Exploring Numeric Variables (cont’d)
– Normalized histogram of Day Minutes shown
with Churn overlay (Top)
– Indicates high usage customers churn at
significantly greater rate
– Results: Carefully track customer Day
Minutes as total exceeds 200
– Investigate why those with high usage tend
to leave
– Normalized histogram of Evening Minutes
shown with Churn overlay (Bottom)
– Higher usage customers churn slightly
more
– Results: Based on graphical evidence, we
cannot conclude beyond a reasonable
doubt that such an effect exists
56
Exploring Numeric Variables (cont’d)
– Additional EDA concludes no obvious association between
Churn and remaining numeric attributes (not shown)
– These numeric attributes probably not strong predictors in
data model
– However, they should be retained as input to model
– Important higher-level associations/interactions may exist
– In this case, let model identify which inputs are important
– Data mining performance adversely affected by many inputs
– Possibility: Use dimension-reduction technique such as
principal components analysis

57
Exploring Multivariate Relationships
– Multivariate graphics can uncover new interaction effects
which our univariate exploration missed
– Figure 3.20 shows a scatter plot of day minutes vs.
evenings minutes, with churners indicated by the darker
circles

58
Selecting Interesting Subsets of
the Data for further Investigation
– Graphical EDA can uncover subsets of records that call
for further investigation, as the rectangle in Figure 3.21
illustrates

59
Exploring Multivariate Relationships –
Python Code
# Day minutes against evening minutes, all records in one colour
sns.scatterplot(data=churn, x="Day Mins", y="Eve Mins")

# The same plot, coloured by the value of the target variable
sns.scatterplot(data=churn, x="Day Mins", y="Eve Mins", hue="Churn?")

# Day minutes against customer service calls, again coloured by the target
sns.scatterplot(data=churn, x="Day Mins", y="CustServ Calls", hue="Churn?")

60
Selecting Interesting Subsets of the
Data for further Investigation (cont'd)
– Figure 3.22 shows that about 65% (115 of 177) of the
selected records are churners
– Those with high customer service calls and low day
minutes have a 65% probability of churning
– Figure 3.23 shows that only about 26% of customers with
high customer service calls and high day minutes are
churners
– Red-flag customers with high customer service calls and
low day minutes

61
Using EDA to uncover Anomalous Fields
– Exploratory data analysis will sometimes uncover strange or
anomalous records or fields which the earlier data cleaning
phase may have missed
– E.g., the area code field contains only three different values
for all the records, 408, 415, and 510 (which all happen to
be California area codes), as shown by Figure 3.24

– However, the three area codes seem to


be distributed more or less evenly across
all the states and the District of Columbia
(be wary of using the area code field as
input)
62
Binning based on Predictive Value
– Trying to determine whether there is a relationship
between evening minutes and churn
– Setting 3 bin values of evening minutes reveals the High
bin at 19.5% churn rate which beats the baseline churn
rate of 14.49% for all customers

63
Binning based on Predictive Value –
Python Code
# Bin evening minutes into three labelled ranges. With right=False the
# intervals are closed on the left: [0, 160.01), [160.01, 240.01), [240.01, 400)
churn['Eve Mins binned'] = pd.cut(x=churn['Eve Mins'],
                                  bins=[0, 160.01, 240.01, 400],
                                  labels=["Low", "Medium", "High"],
                                  right=False)

# Cross-tabulate the target against the binned variable
churn_crosstab = pd.crosstab(churn["Churn?"], churn["Eve Mins binned"])

churn_crosstab.plot(
    kind='bar', stacked=True,
    title='Bar Graph of Evening Minutes (Binned) with Churn Overlay')

64
Using EDA to Investigate
Correlated Predictor Variables
– Just because two variables are correlated does not mean that we
should omit one of them
– Instead use the following strategy:
1) Identify any variables that are perfectly correlated (that is, r = 1.0
or r = -1.0). Do not retain both variables in the model, but rather omit
one
2) Identify groups of variables that are correlated with each other.
Then, later, during the modeling phase, apply dimension reduction
methods, such as principal components analysis, to these variables

65
Using EDA to Investigate
Correlated Predictor Variables (cont'd)
– There does not seem to be any relationship between day
minutes and day calls, nor between day calls and day charge
– On the other hand, there is a perfect linear relationship between
day minutes and day charge, indicating that day charge is a
simple linear function of day minutes only
– We may express this function as the estimated regression
equation:
– “Day charge equals 0.000613 plus 0.17 times Day minutes.”
– Since day charge is perfectly correlated with day minutes, then
we should eliminate one of the two variables
– After dealing with the perfectly correlated predictors, the
correlation of each numerical predictor with every other
numerical predictor should be checked: (see next slide)
66
Using EDA to Investigate
Correlated Predictor Variables (cont'd)
– All relationships between the remaining numerical
predictors are very weak and statistically not significant.

67
Correlated Predictor Variables –
Python Code
from pandas.plotting import scatter_matrix
scatter_matrix(churn)

# correlation matrix
# Restrict to the numeric columns first: churn also holds text columns, and
# pandas >= 2.0 raises an error when corr() meets non-numeric data.
correlations = churn.select_dtypes(include='number').corr()
print(correlations)

68
Correlated Predictor Variables –
Python Code
# plot correlation matrix (generic)
fig = plt.figure()
ax = fig.add_subplot(111)
# fix the colour scale to the full correlation range [-1, 1]
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# change the tick labels
# Labels must come from the correlation matrix, not from churn.columns: the
# matrix holds only some of churn's columns, so churn.columns would give the
# wrong number of labels and wrong names.
ticks = range(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(correlations.columns, rotation=90)
ax.set_yticklabels(correlations.columns)

69
Our EDA – Brief Summary
– The four charge fields are linear functions of the minute fields, and
should be omitted
– The area code field and/or the state field are anomalous, and should
be omitted until further clarification is obtained
– Some insights with respect to churn are as follows:
– Customers with the International Plan tend to churn more
frequently
– Customers with the Voice Mail Plan tend to churn less frequently
– Customers with four or more Customer Service Calls tend to
churn more frequently
– Customers with both high Day Minutes and high Evening
Minutes tend to churn at a higher rate than the other customers
– Customers with low Day Minutes and high Customer Service
Calls churn at a higher rate than the other customers
70

You might also like