import pandas as pd
import os

df = pd.read_csv("./Sales_Data/Sales_April_2019.csv")
all_moths_Data = pd.DataFrame()
files = [file for file in os.listdir("./Sales_Data")] 
for file in files:
    df = pd.read_csv("./Sales_Data/"+file)
    all_moths_Data = pd.concat([all_moths_Data, df])
all_moths_Data.to_csv("all_data.csv", index=False)
all_moths_Data.head()
# df.head()

all_data = pd.read_csv("all_data.csv")

nan_df = all_data[all_data.isna().any(axis=1)]
# nan_df.head()ˇ
all_data = all_data.dropna(how='all')

all_data = all_data[all_data['Order Date'].str[0:2] != 'Or']

all_data['Quantity Ordered'] = pd.to_numeric(all_data['Quantity Ordered']) #make int
all_data['Price Each'] = pd.to_numeric(all_data['Price Each']) #make float

all_data['Month'] = all_data['Order Date'].str[0:2]
all_data['Month'] = all_data['Month'].astype('int32')

all_data.head()

all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']
all_data.head()

def get_city(adress):
    return adress.split(",")[1]
def get_state(adress):
    return adress.split(",")[2].split(" ")[1]

all_data['City'] = all_data['Purchase Address'].apply(lambda x: get_city(x) + ' '+ get_state(x))
all_data.head()

results = all_data.groupby('Month')[['Quantity Ordered', 'Price Each', 'Sales']].sum()

import matplotlib.pyplot as plt

months = range(1,13)

plt.bar(months, results['Sales'])
plt.xticks(months)
plt.xlabel("Months")
plt.ylabel('USD in Milions')
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

results = all_data.groupby('City')[['Quantity Ordered', 'Price Each', 'Month','Sales']].sum()
results

cities = [city for city, df in all_data.groupby('City')]

# Plotting
plt.bar(cities, results['Sales'])
plt.xticks(cities, rotation='vertical', size=8)
plt.xlabel("US City name")
plt.ylabel('USD in Millions')
plt.show()

all_data.head()
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Count'] = 1
all_data.head()

hours = [hour for hour, df in all_data.groupby('Hour')]
plt.plot(hours, all_data.groupby(['Hour']).count())
plt.xticks(hours)
plt.xlabel("Hours")
plt.ylabel("Num of orders")
plt.grid()
plt.show()

df = all_data[all_data['Order ID'].duplicated(keep=False)]
df['Grouped'] = df.groupby('Order ID')['Product'].transform(lambda x: ','.join(x))
df = df[['Order ID', 'Grouped']].drop_duplicates()
df.head()

from itertools import combinations
from collections import Counter

count = Counter()

for row in df['Grouped']:
    row_list = row.split(',')
    count.update(Counter(combinations(row_list,2)))
most_common_products = count.most_common(10)
most_common_products

[(('iPhone', 'Lightning Charging Cable'), 1005),
 (('Google Phone', 'USB-C Charging Cable'), 987),
 (('iPhone', 'Wired Headphones'), 447),
 (('Google Phone', 'Wired Headphones'), 414),
 (('Vareebadd Phone', 'USB-C Charging Cable'), 361),
 (('iPhone', 'Apple Airpods Headphones'), 360),
 (('Google Phone', 'Bose SoundSport Headphones'), 220),
 (('USB-C Charging Cable', 'Wired Headphones'), 160),
 (('Vareebadd Phone', 'Wired Headphones'), 143),
 (('Lightning Charging Cable', 'Wired Headphones'), 92)]

product_pairs, counts = zip(*most_common_products)

# Create labels from product pairs
labels = [f"{pair[0]} & {pair[1]}" for pair in product_pairs]

# Plotting the bar graph
plt.figure(figsize=(10, 6))
plt.barh(labels, counts, color='skyblue')
plt.xlabel('Number of Occurrences')
plt.ylabel('Product Pairs')
plt.title('Top 10 Most Common Product Pairs')
plt.gca().invert_yaxis()  # Invert y-axis to have the most common pair on top
plt.show()

all_data.head()

grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
grouped_data.values

array([ 4129,  6244,  7550,  6199, 27635, 31017, 15661, 13457,  4819,
        5532,   646,   666, 23217,  4728,  4130, 23975,  2068, 20557,
        6849], dtype=int64)

# numeric_columns = ['Quantity Ordered', 'Price Each', 'Month','Sales','Hour' ,'Count']
grouped_data = all_data.groupby('Product')['Quantity Ordered'].sum()
# products = [product for product, df in  grouped_data]
plt.figure(figsize=(10, 6))  # Adjust the size of the plot
plt.bar(grouped_data.index, grouped_data.values, color='skyblue')

# Adding labels and title
plt.xlabel('Product')
plt.ylabel('Quantity Ordered')
plt.title('Total Quantity Ordered by Product')
plt.xticks(rotation='vertical')  # Rotate x-axis labels for better readability

# Show the plot
plt.show()

grouped_quantity = all_data.groupby('Product')['Quantity Ordered'].sum()
prices = all_data.groupby('Product')['Price Each'].mean()

# Create a figure and a set of subplots
fig, ax1 = plt.subplots(figsize=(10, 6))

# Plot the first y-axis: Quantity Ordered
ax1.bar(grouped_quantity.index, grouped_quantity.values, color='skyblue')
ax1.set_xlabel('Product')
ax1.set_ylabel('Quantity Ordered', color='skyblue')
# ax1.tick_params(axis='y', labelcolor='skyblue')
ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')

# Create a second y-axis sharing the same x-axis
ax2 = ax1.twinx()
ax2.plot(grouped_quantity.index, prices.values, color='green')
ax2.set_ylabel('Average Price Each (USD)', color='green')
ax2.tick_params(axis='y', labelcolor='green')

# Title of the plot
plt.title('Total Quantity Ordered and Average Price by Product')

# Show the plot
plt.show()

C:\Users\Michal\AppData\Local\Temp\ipykernel_13708\254779068.py:12: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax1.set_xticklabels(grouped_quantity.index, rotation='vertical')

	Order ID	Product	Quantity Ordered	Price Each	Order Date	Purchase Address
0	176558	USB-C Charging Cable	2	11.95	04/19/19 08:46	917 1st St, Dallas, TX 75001
1	176559	Bose SoundSport Headphones	1	99.99	04/07/19 22:30	682 Chestnut St, Boston, MA 02215
2	176560	Google Phone	1	600	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001
3	176560	Wired Headphones	1	11.99	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001
4	176561	Wired Headphones	1	11.99	04/30/19 09:27	333 8th St, Los Angeles, CA 90001

	Order ID	Product	Quantity Ordered	Price Each	Order Date	Purchase Address	Month
0	176558	USB-C Charging Cable	2	11.95	04/19/19 08:46	917 1st St, Dallas, TX 75001	4
1	176559	Bose SoundSport Headphones	1	99.99	04/07/19 22:30	682 Chestnut St, Boston, MA 02215	4
2	176560	Google Phone	1	600.00	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4
3	176560	Wired Headphones	1	11.99	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4
4	176561	Wired Headphones	1	11.99	04/30/19 09:27	333 8th St, Los Angeles, CA 90001	4

	Order ID	Product	Quantity Ordered	Price Each	Order Date	Purchase Address	Month	Sales
0	176558	USB-C Charging Cable	2	11.95	04/19/19 08:46	917 1st St, Dallas, TX 75001	4	23.90
1	176559	Bose SoundSport Headphones	1	99.99	04/07/19 22:30	682 Chestnut St, Boston, MA 02215	4	99.99
2	176560	Google Phone	1	600.00	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4	600.00
3	176560	Wired Headphones	1	11.99	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4	11.99
4	176561	Wired Headphones	1	11.99	04/30/19 09:27	333 8th St, Los Angeles, CA 90001	4	11.99

	Order ID	Product	Quantity Ordered	Price Each	Order Date	Purchase Address	Month	Sales	City
0	176558	USB-C Charging Cable	2	11.95	04/19/19 08:46	917 1st St, Dallas, TX 75001	4	23.90	Dallas TX
1	176559	Bose SoundSport Headphones	1	99.99	04/07/19 22:30	682 Chestnut St, Boston, MA 02215	4	99.99	Boston MA
2	176560	Google Phone	1	600.00	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4	600.00	Los Angeles CA
3	176560	Wired Headphones	1	11.99	04/12/19 14:38	669 Spruce St, Los Angeles, CA 90001	4	11.99	Los Angeles CA
4	176561	Wired Headphones	1	11.99	04/30/19 09:27	333 8th St, Los Angeles, CA 90001	4	11.99	Los Angeles CA

	Quantity Ordered	Price Each	Month	Sales
City
Atlanta GA	16602	2779908.20	104794	2795498.58
Austin TX	11153	1809873.61	69829	1819581.75
Boston MA	22528	3637409.77	141112	3661642.01
Dallas TX	16730	2752627.82	104620	2767975.40
Los Angeles CA	33289	5421435.23	208325	5452570.80
New York City NY	27932	4635370.83	175741	4664317.43
Portland ME	2750	447189.25	17144	449758.27
Portland OR	11303	1860558.22	70621	1870732.34
San Francisco CA	50239	8211461.74	315520	8262203.91
Seattle WA	16553	2733296.01	104941	2747755.48

Merge 12 months of sales data into a single file¶

Read in updated dataframe¶

Clean up the data¶

Augment data with additional columns¶

Question 1: What was the best month for sales? How much was earned that month?¶

Question 2: What city had the highest number of sales ?¶

Question 3: What time should we display advertisements to maximize likehood of customers buying product?¶

Question 4: What products are most often sold together?¶