In [ ]:
import pandas as pd
import os
Merge 12 months of sales data into a single file¶
In [ ]:
df = pd.read_csv("./Sales_Data/Sales_April_2019.csv")
all_moths_Data = pd.DataFrame()
files = [file for file in os.listdir("./Sales_Data")]
for file in files:
df = pd.read_csv("./Sales_Data/"+file)
all_moths_Data = pd.concat([all_moths_Data, df])
all_moths_Data.to_csv("all_data.csv", index=False)
all_moths_Data.head()
# df.head()
Out[Â ]:
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 |
| 2 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 |
Read in updated dataframe¶
In [ ]:
all_data = pd.read_csv("all_data.csv")
Clean up the data¶
Drop rows of NaN
In [ ]:
nan_df = all_data[all_data.isna().any(axis=1)]
# nan_df.head()ˇ
all_data = all_data.dropna(how='all')
Find 'Or' and delete it
In [ ]:
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or']
Convert columns to the correct type
In [ ]:
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered']) #make int
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) #make float
Augment data with additional columns¶
Task2: Add month column:
In [ ]:
all_data['Month'] = all_data['Order Date'].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')
all_data.head()
Out[Â ]:
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | |
|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 |
| 2 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 |
Task 3: Add sales columm
In [ ]:
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()
Out[Â ]:
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | |
|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 |
| 2 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 |
Add city name column
In [ ]:
def get_city(adress):
return adress.split(",")[1]
def get_state(adress):
return adress.split(",")[2].split(" ")[1]
all_data['City'] = all_data['Purchase Address'].apply(lambda x: get_city(x) + ' '+ get_state(x))
all_data.head()
Out[Â ]:
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas TX |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston MA |
| 2 | 176560 | Google Phone | 1 | 600.00 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles CA |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles CA |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 04/30/19 09:27 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles CA |
Question 1: What was the best month for sales? How much was earned that month?¶
In [ ]:
results = all_data.groupby('Month')[['Quantity Ordered', 'Price Each', 'Sales']].sum()
In [ ]:
import matplotlib.pyplot as plt
months = range(1,13)
plt.bar(months, results['Sales'])
plt.xticks(months)
plt.xlabel("Months")
plt.ylabel('USD in Milions')
plt.show
Out[Â ]:
<function matplotlib.pyplot.show(close=None, block=None)>
Question 2: What city had the highest number of sales ?¶
In [ ]:
results = all_data.groupby('City')[['Quantity Ordered', 'Price Each', 'Month','Sales']].sum()
results
Out[Â ]:
| Quantity Ordered | Price Each | Month | Sales | |
|---|---|---|---|---|
| City | ||||
| Atlanta GA | 16602 | 2779908.20 | 104794 | 2795498.58 |
| Austin TX | 11153 | 1809873.61 | 69829 | 1819581.75 |
| Boston MA | 22528 | 3637409.77 | 141112 | 3661642.01 |
| Dallas TX | 16730 | 2752627.82 | 104620 | 2767975.40 |
| Los Angeles CA | 33289 | 5421435.23 | 208325 | 5452570.80 |
| New York City NY | 27932 | 4635370.83 | 175741 | 4664317.43 |
| Portland ME | 2750 | 447189.25 | 17144 | 449758.27 |
| Portland OR | 11303 | 1860558.22 | 70621 | 1870732.34 |
| San Francisco CA | 50239 | 8211461.74 | 315520 | 8262203.91 |
| Seattle WA | 16553 | 2733296.01 | 104941 | 2747755.48 |
In [ ]:
cities = [city for city, df in all_data.groupby('City')]
# Plotting
plt.bar(cities, results['Sales'])
plt.xticks(cities, rotation='vertical', size=8)
plt.xlabel("US City name")
plt.ylabel('USD in Millions')
plt.show()
Question 3: What time should we display advertisements to maximize likehood of customers buying product?¶
In [ ]:
all_data.head()
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Count'] = 1
all_data.head()
In [ ]:
hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(hours, all_data.groupby(['Hour']).count())
plt.xticks(hours)
plt.xlabel("Hours")
plt.ylabel("Num of orders")
plt.grid()
plt.show()
Question 4: What products are most often sold together?¶
In [ ]:
df = all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
In [ ]:
from itertools import combinations
from collections import Counter
count = Counter()
for row in df['Grouped']:
row_list = row.split(',')
count.update(Counter(combinations(row_list,2)))
most_common_products = count.most_common(10)
most_common_products
Out[Â ]:
[(('iPhone', 'Lightning Charging Cable'), 1005),
(('Google Phone', 'USB-C Charging Cable'), 987),
(('iPhone', 'Wired Headphones'), 447),
(('Google Phone', 'Wired Headphones'), 414),
(('Vareebadd Phone', 'USB-C Charging Cable'), 361),
(('iPhone', 'Apple Airpods Headphones'), 360),
(('Google Phone', 'Bose SoundSport Headphones'), 220),
(('USB-C Charging Cable', 'Wired Headphones'), 160),
(('Vareebadd Phone', 'Wired Headphones'), 143),
(('Lightning Charging Cable', 'Wired Headphones'), 92)]
In [ ]:
product_pairs, counts = zip(*most_common_products)
# Create labels from product pairs
labels = [f"{pair[0]} & {pair[1]}" for pair in product_pairs]
# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.barh(labels, counts, color='skyblue')
plt.xlabel('Number of Occurrences')
plt.ylabel('Product Pairs')
plt.title('Top 10 Most Common Product Pairs')
plt.gca().invert_yaxis() # Invert y-axis to have the most common pair on top
plt.show()
In [ ]:
all_data.head()
Out[Â ]:
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Month | Sales | City | Hour | Count | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 2019-04-19 08:46:00 | 917 1st St, Dallas, TX 75001 | 4 | 23.90 | Dallas TX | 8 | 1 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 2019-04-07 22:30:00 | 682 Chestnut St, Boston, MA 02215 | 4 | 99.99 | Boston MA | 22 | 1 |
| 2 | 176560 | Google Phone | 1 | 600.00 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 600.00 | Los Angeles CA | 14 | 1 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles CA | 14 | 1 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 2019-04-30 09:27:00 | 333 8th St, Los Angeles, CA 90001 | 4 | 11.99 | Los Angeles CA | 9 | 1 |
In [ ]:
grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
grouped_data.values
Out[Â ]:
array([ 4129, 6244, 7550, 6199, 27635, 31017, 15661, 13457, 4819,
5532, 646, 666, 23217, 4728, 4130, 23975, 2068, 20557,
6849], dtype=int64)
In [ ]:
# numeric_columns = ['Quantity Ordered', 'Price Each', 'Month','Sales','Hour' ,'Count']
grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
# products = [product for product, df in grouped_data]
plt.figure(figsize=(10, 6)) # Adjust the size of the plot
plt.bar(grouped_data.index, grouped_data.values, color='skyblue')
# Adding labels and title
plt.xlabel('Product')
plt.ylabel('Quantity Ordered')
plt.title('Total Quantity Ordered by Product')
plt.xticks(rotation='vertical') # Rotate x-axis labels for better readability
# Show the plot
plt.show()
In [ ]:
grouped_quantity = all_data.groupby('Product')['Quantity Ordered'].sum()
prices = all_data.groupby('Product')['Price Each'].mean()
# Create a figure and a set of subplots
fig, ax1 = plt.subplots(figsize=(10, 6))
# Plot the first y-axis: Quantity Ordered
ax1.bar(grouped_quantity.index, grouped_quantity.values, color='skyblue')
ax1.set_xlabel('Product')
ax1.set_ylabel('Quantity Ordered', color='skyblue')
# ax1.tick_params(axis='y', labelcolor='skyblue')
ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')
# Create a second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(grouped_quantity.index, prices.values, color='green')
ax2.set_ylabel('Average Price Each (USD)', color='green')
ax2.tick_params(axis='y', labelcolor='green')
# Title of the plot
plt.title('Total Quantity Ordered and Average Price by Product')
# Show the plot
plt.show()
C:\Users\Michal\AppData\Local\Temp\ipykernel_13708\254779068.py:12: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')