Loading database

library(tidyverse)

# loading database without index and desc column
df <- read.csv("BigBasket Products.csv") %>% select(-index,-description)

# factor distinct columns
df <- df %>% mutate(category=as_factor(category),
              sub_category=as_factor(sub_category),
              brand=as_factor(brand),
              type=as_factor(type))
# adding discount column
df <- df %>% mutate(discount=round((market_price-sale_price)/market_price, digits = 3))

Summary the data

summary(df)
##    product                              category   
##  Length:27555       Beauty & Hygiene        :7867  
##  Class :character   Gourmet & World Food    :4690  
##  Mode  :character   Kitchen, Garden & Pets  :3580  
##                     Snacks & Branded Foods  :2814  
##                     Foodgrains, Oil & Masala:2676  
##                     Cleaning & Household    :2675  
##                     (Other)                 :3253  
##                 sub_category                brand         sale_price      
##  Skin Care            : 2294   Fresho          :  638   Min.   :    2.45  
##  Health & Medicine    : 1133   bb Royal        :  539   1st Qu.:   95.00  
##  Hair Care            : 1028   BB Home         :  428   Median :  190.00  
##  Storage & Accessories: 1015   DP              :  250   Mean   :  322.51  
##  Fragrances & Deos    : 1000   Fresho Signature:  171   3rd Qu.:  359.00  
##  Bath & Hand Wash     :  996   bb Combo        :  168   Max.   :12500.00  
##  (Other)              :20089   (Other)         :25361                     
##   market_price                        type           rating     
##  Min.   :    3.0   Face Care            : 1508   Min.   :1.000  
##  1st Qu.:  100.0   Ayurveda             :  538   1st Qu.:3.700  
##  Median :  220.0   Men's Deodorants     :  500   Median :4.100  
##  Mean   :  382.1   Shampoo & Conditioner:  461   Mean   :3.943  
##  3rd Qu.:  425.0   Containers Sets      :  415   3rd Qu.:4.300  
##  Max.   :12500.0   Glassware            :  415   Max.   :5.000  
##                    (Other)              :23718   NA's   :8626   
##     discount     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0500  
##  Mean   :0.1182  
##  3rd Qu.:0.2000  
##  Max.   :0.8370  
## 

% of NA’s in rating category

df %>% summarise(perc_of_na_rating = sum(is.na(rating))/n()*100)
##   perc_of_na_rating
## 1          31.30466

About 31% of rating data don’t have value.

Data structure

str(df)
## 'data.frame':    27555 obs. of  9 variables:
##  $ product     : chr  "Garlic Oil - Vegetarian Capsule 500 mg" "Water Bottle - Orange" "Brass Angle Deep - Plain, No.2" "Cereal Flip Lid Container/Storage Jar - Assorted Colour" ...
##  $ category    : Factor w/ 11 levels "Beauty & Hygiene",..: 1 2 3 3 1 3 1 1 1 3 ...
##  $ sub_category: Factor w/ 90 levels "Hair Care","Storage & Accessories",..: 1 2 3 4 5 6 7 5 1 8 ...
##  $ brand       : Factor w/ 2314 levels "Sri Sri Ayurveda ",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ sale_price  : num  220 180 119 149 162 ...
##  $ market_price: num  220 180 250 176 162 ...
##  $ type        : Factor w/ 426 levels "Hair Oil & Serum",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ rating      : num  4.1 2.3 3.4 3.7 4.4 3.3 3.6 4 3.5 4.3 ...
##  $ discount    : num  0 0 0.524 0.153 0 0.151 0 0 0 0 ...

What is the share of each category

df %>% 
  group_by(category) %>% 
  summarise(count=n()) %>% 
  mutate(perc=count/sum(count)) %>% 
  arrange(count)%>% 
  ggplot(aes(y=fct_reorder(category, count), x=perc*100, fill=category))+
  geom_col(show.legend = F)+
  labs(x="Percent", y="", title = "Percentage of a category")+
  theme_minimal()

About 30% items are from Beauty category.

Which brands have the best rating?

df2 <- df %>% group_by(brand) %>% 
  filter(!is.na(rating)) %>% 
  summarise(mean_rating = mean(rating), n_element = n()) %>% 
  arrange(desc(mean_rating))
# top rating brands which have more than 4 products
df2 %>% filter(n_element>4) %>% head(10) %>% select(-n_element)
## # A tibble: 10 x 2
##    brand                 mean_rating
##    <fct>                       <dbl>
##  1 Sangam Sweets                4.83
##  2 Kaadoo                       4.81
##  3 Bobs Red Mill                4.79
##  4 Nimyle                       4.78
##  5 Continent Spice Khadi        4.76
##  6 Pine Sol                     4.76
##  7 The Whole Truth              4.75
##  8 SOVI                         4.66
##  9 Pyrex                        4.63
## 10 Tabasco                      4.62
# top rating brands which have more than 9 products
df2 %>% filter(n_element>9) %>% head(10) %>% select(-n_element)
## # A tibble: 10 x 2
##    brand               mean_rating
##    <fct>                     <dbl>
##  1 "Hi-Luxe"                  4.57
##  2 "VAHDAM"                   4.49
##  3 "Morpheme Remedies"        4.46
##  4 "Papergrid"                4.43
##  5 "Whiskas"                  4.42
##  6 "Parachute "               4.41
##  7 "Tide"                     4.4 
##  8 "BB Home Herbal"           4.39
##  9 "Laopala Diva"             4.38
## 10 "Selpak"                   4.38

Discounts for each category?

df %>% ggplot(aes(y=category, x=discount, color=category)) +
  geom_boxplot(show.legend=F)

The vast majority of discounts are from 0 to 20%.

We can see that there are suspiciously many values around 0.2 in the Fruits & Vegetables category.

df %>% filter(category=="Fruits & Vegetables") %>% 
  summarise(discout_equal_0.2 =sum(discount==0.2)/n())
##   discout_equal_0.2
## 1         0.8743268

87% of products from the Fruits & Vegetables category have 20% discount.

Does discounting affect product rating?

df %>% na.omit(rating) %>% 
ggplot(aes(discount, rating))+
geom_point()+
geom_smooth()+
facet_wrap(vars(category))

There is huge spread and it seems that discounting doesn’t affect rating. For each category rating is about 4.