In [ ]:
import pandas as pd
import os

Merge 12 months of sales data into a single file¶

In [ ]:
df = pd.read_csv("./Sales_Data/Sales_April_2019.csv")
all_moths_Data = pd.DataFrame()
files = [file for file in os.listdir("./Sales_Data")] 
for file in files:
    df = pd.read_csv("./Sales_Data/"+file)
    all_moths_Data = pd.concat([all_moths_Data, df])
all_moths_Data.to_csv("all_data.csv", index=False)
all_moths_Data.head()
# df.head() 
Out[ ]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001
1 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215
2 176560 Google Phone 1 600 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
3 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001
4 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001

Read in updated dataframe¶

In [ ]:
all_data = pd.read_csv("all_data.csv")

Clean up the data¶

Drop rows of NaN

In [ ]:
nan_df = all_data[all_data.isna().any(axis=1)]
# nan_df.head()ˇ
all_data = all_data.dropna(how='all')

Find 'Or' and delete it

In [ ]:
all_data = all_data[all_data['Order Date'].str[0:2] != 'Or'] 

Convert columns to the correct type

In [ ]:
all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered']) #make int
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) #make float

Augment data with additional columns¶

Task2: Add month column:

In [ ]:
all_data['Month'] = all_data['Order Date'].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')

all_data.head()
Out[ ]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4
1 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4
2 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
3 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4
4 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4

Task 3: Add sales columm

In [ ]:
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()
Out[ ]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90
1 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99
2 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00
3 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99
4 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99

Add city name column

In [ ]:
def get_city(adress):
    return adress.split(",")[1]
def get_state(adress):
    return adress.split(",")[2].split(" ")[1]

all_data['City'] = all_data['Purchase Address'].apply(lambda x: get_city(x) + ' '+ get_state(x))
all_data.head()
Out[ ]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City
0 176558 USB-C Charging Cable 2 11.95 04/19/19 08:46 917 1st St, Dallas, TX 75001 4 23.90 Dallas TX
1 176559 Bose SoundSport Headphones 1 99.99 04/07/19 22:30 682 Chestnut St, Boston, MA 02215 4 99.99 Boston MA
2 176560 Google Phone 1 600.00 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles CA
3 176560 Wired Headphones 1 11.99 04/12/19 14:38 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles CA
4 176561 Wired Headphones 1 11.99 04/30/19 09:27 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles CA

Question 1: What was the best month for sales? How much was earned that month?¶

In [ ]:
results = all_data.groupby('Month')[['Quantity Ordered', 'Price Each', 'Sales']].sum()
In [ ]:
import matplotlib.pyplot as plt

months = range(1,13)

plt.bar(months, results['Sales'])
plt.xticks(months)
plt.xlabel("Months")
plt.ylabel('USD in Milions')
plt.show
Out[ ]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Question 2: What city had the highest number of sales ?¶

In [ ]:
results = all_data.groupby('City')[['Quantity Ordered', 'Price Each', 'Month','Sales']].sum()
results
Out[ ]:
Quantity Ordered Price Each Month Sales
City
Atlanta GA 16602 2779908.20 104794 2795498.58
Austin TX 11153 1809873.61 69829 1819581.75
Boston MA 22528 3637409.77 141112 3661642.01
Dallas TX 16730 2752627.82 104620 2767975.40
Los Angeles CA 33289 5421435.23 208325 5452570.80
New York City NY 27932 4635370.83 175741 4664317.43
Portland ME 2750 447189.25 17144 449758.27
Portland OR 11303 1860558.22 70621 1870732.34
San Francisco CA 50239 8211461.74 315520 8262203.91
Seattle WA 16553 2733296.01 104941 2747755.48
In [ ]:
cities = [city for city, df in all_data.groupby('City')]

# Plotting
plt.bar(cities, results['Sales'])
plt.xticks(cities, rotation='vertical', size=8)
plt.xlabel("US City name")
plt.ylabel('USD in Millions')
plt.show()
No description has been provided for this image

Question 3: What time should we display advertisements to maximize likehood of customers buying product?¶

In [ ]:
all_data.head()
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Count'] = 1
all_data.head()
In [ ]:
hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(hours, all_data.groupby(['Hour']).count())
plt.xticks(hours)
plt.xlabel("Hours")
plt.ylabel("Num of orders")
plt.grid()
plt.show()
No description has been provided for this image

Question 4: What products are most often sold together?¶

In [ ]:
df = all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()
In [ ]:
from itertools import combinations
from collections import Counter

count = Counter()

for row in df['Grouped']:
    row_list = row.split(',')
    count.update(Counter(combinations(row_list,2)))
most_common_products = count.most_common(10)
most_common_products
Out[ ]:
[(('iPhone', 'Lightning Charging Cable'), 1005),
 (('Google Phone', 'USB-C Charging Cable'), 987),
 (('iPhone', 'Wired Headphones'), 447),
 (('Google Phone', 'Wired Headphones'), 414),
 (('Vareebadd Phone', 'USB-C Charging Cable'), 361),
 (('iPhone', 'Apple Airpods Headphones'), 360),
 (('Google Phone', 'Bose SoundSport Headphones'), 220),
 (('USB-C Charging Cable', 'Wired Headphones'), 160),
 (('Vareebadd Phone', 'Wired Headphones'), 143),
 (('Lightning Charging Cable', 'Wired Headphones'), 92)]
In [ ]:
product_pairs, counts = zip(*most_common_products)

# Create labels from product pairs
labels = [f"{pair[0]} & {pair[1]}" for pair in product_pairs]

# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.barh(labels, counts, color='skyblue')
plt.xlabel('Number of Occurrences')
plt.ylabel('Product Pairs')
plt.title('Top 10 Most Common Product Pairs')
plt.gca().invert_yaxis()  # Invert y-axis to have the most common pair on top
plt.show()
No description has been provided for this image
In [ ]:
all_data.head()
Out[ ]:
Order ID Product Quantity Ordered Price Each Order Date Purchase Address Month Sales City Hour Count
0 176558 USB-C Charging Cable 2 11.95 2019-04-19 08:46:00 917 1st St, Dallas, TX 75001 4 23.90 Dallas TX 8 1
1 176559 Bose SoundSport Headphones 1 99.99 2019-04-07 22:30:00 682 Chestnut St, Boston, MA 02215 4 99.99 Boston MA 22 1
2 176560 Google Phone 1 600.00 2019-04-12 14:38:00 669 Spruce St, Los Angeles, CA 90001 4 600.00 Los Angeles CA 14 1
3 176560 Wired Headphones 1 11.99 2019-04-12 14:38:00 669 Spruce St, Los Angeles, CA 90001 4 11.99 Los Angeles CA 14 1
4 176561 Wired Headphones 1 11.99 2019-04-30 09:27:00 333 8th St, Los Angeles, CA 90001 4 11.99 Los Angeles CA 9 1
In [ ]:
grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
grouped_data.values
Out[ ]:
array([ 4129,  6244,  7550,  6199, 27635, 31017, 15661, 13457,  4819,
        5532,   646,   666, 23217,  4728,  4130, 23975,  2068, 20557,
        6849], dtype=int64)
In [ ]:
# numeric_columns = ['Quantity Ordered', 'Price Each', 'Month','Sales','Hour' ,'Count']
grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
# products = [product for product, df in  grouped_data]
plt.figure(figsize=(10, 6))  # Adjust the size of the plot
plt.bar(grouped_data.index, grouped_data.values, color='skyblue')

# Adding labels and title
plt.xlabel('Product')
plt.ylabel('Quantity Ordered')
plt.title('Total Quantity Ordered by Product')
plt.xticks(rotation='vertical')  # Rotate x-axis labels for better readability

# Show the plot
plt.show()
No description has been provided for this image
In [ ]:
grouped_quantity = all_data.groupby('Product')['Quantity Ordered'].sum()
prices = all_data.groupby('Product')['Price Each'].mean()

# Create a figure and a set of subplots
fig, ax1 = plt.subplots(figsize=(10, 6))

# Plot the first y-axis: Quantity Ordered
ax1.bar(grouped_quantity.index, grouped_quantity.values, color='skyblue')
ax1.set_xlabel('Product')
ax1.set_ylabel('Quantity Ordered', color='skyblue')
# ax1.tick_params(axis='y', labelcolor='skyblue')
ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')

# Create a second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(grouped_quantity.index, prices.values, color='green')
ax2.set_ylabel('Average Price Each (USD)', color='green')
ax2.tick_params(axis='y', labelcolor='green')

# Title of the plot
plt.title('Total Quantity Ordered and Average Price by Product')

# Show the plot
plt.show()
C:\Users\Michal\AppData\Local\Temp\ipykernel_13708\254779068.py:12: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')
No description has been provided for this image