ODD Dataset

This is a worksheet of ODD Assignment.The dataset for this project includes car sales numbers from the beginning of 2016 through September 2019.

Preparing and Cleaning Datasets

We begin the project by reading the excel files that we download from the ODD website. For sales_2018, data gathering and celaning code given: (for all years, same code structure was used)

#library(tidyverse)
#library(readxl)
#2018 Sales Data#
#jan_2018<-read_excel("2018.01.xlsx",skip=7,col_names=FALSE)
#feb_2018<-read_excel("2018.02.xlsx",skip=7,col_names=FALSE)
#mar_2018<-read_excel("2018.03.xlsx",skip=7,col_names=FALSE)
#apr_2018<-read_excel("2018.04.xlsx",skip=7,col_names=FALSE)
#may_2018<-read_excel("2018.05.xlsx",skip=7,col_names=FALSE)
#jun_2018<-read_excel("2018.06.xlsx",skip=7,col_names=FALSE)
#jul_2018<-read_excel("2018.07.xlsx",skip=7,col_names=FALSE)
#aug_2018<-read_excel("2018.08.xlsx",skip=7,col_names=FALSE)
#sep_2018<-read_excel("2018.09.xlsx",skip=7,col_names=FALSE)
#oct_2018<-ead_excel("2018.10.xlsx",skip=7,col_names=FALSE)
#nov_2018<-read_excel("2018.11.xlsx",skip=7,col_names=FALSE)
#dec_2018<-read_excel("2018.12.xlsx",skip=7,col_names=FALSE)
#Slicing unnecessary rows from data
#jan_2018<-jan_2018%>%slice(-c(44,45))
#feb_2018<-feb_2018%>%slice(-c(44,45))
#mar_2018<-mar_2018%>%slice(-c(44,45))
#apr_2018<-apr_2018%>%slice(-c(43,44))
#may_2018<-may_2018%>%slice(-c(43,44))
#jun_2018<-jun_2018%>%slice(-c(43,44))
#jul_2018<-jul_2018%>%slice(-c(43,44))
#aug_2018<-aug_2018%>%slice(-c(43,44))
#sep_2018<-sep_2018%>%slice(-c(43,44))
#oct_2018<-oct_2018%>%slice(-c(43,44))
#nov_2018<-nov_2018%>%slice(-c(43,44))
#dec_2018<-dec_2018%>%slice(-c(43,44))
#Changing column names#
#dfs<-c("jan_2018","feb_2018","mar_2018","apr_2018","may_2018","jun_2018","jul_2018","aug_2018","sep_2018","oct_2018","nov_2018","dec_2018")  
#for(df in dfs) {
#  df.tmp <- get(df)
#  names(df.tmp) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom",
#  "comm_imp","comm_total","total_dom","total_imp","total_total") 
#  assign(df, df.tmp)
# }
#Creating Month and Year Column#
#jan_2018 <- jan_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 1)
#feb_2018 <- feb_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 2)
#mar_2018 <- mar_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 3)
#apr_2018 <- apr_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 4)
#may_2018 <- may_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 5)
#jun_2018 <- jun_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 6)
#jul_2018 <- jul_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 7)
#aug_2018 <- aug_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 8)
#sep_2018 <- sep_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 9)
#oct_2018 <- oct_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 10)
#nov_2018 <- nov_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 11)
#dec_2018 <- dec_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 12)
#Combined 2018 Sales Data#
#sales_2018 <- rbind(jan_2018, feb_2018, mar_2018, apr_2018, may_2018, jun_2018, jul_2018, aug_2018, sep_2018, oct_2018, nov_2018, dec_2018)
#saveRDS(sales_2018,file="sales_2018.rds")

Merging Datasets

library(tidyverse)
library(ggplot2)
library(zoo)
library(scales)
library(formattable)
library(knitr)
sales_2016 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2016.rds?raw=true"))
sales_2017 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2017.rds?raw=true"))
sales_2018 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2018.rds?raw=true"))
sales_2019 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2019.rds?raw=true"))
combined_sales <- rbind(sales_2016, sales_2017, sales_2018, sales_2019)
combined_sales <- combined_sales %>% filter(!str_detect(brand_name,"ODD"))
combined_sales$brand_name <- str_replace_all(combined_sales$brand_name,"ASTON MARTÄ°N","ASTON MARTIN")

We should create the date column for analysis

combined_sales$YearMonth <- as.Date(as.yearmon(paste(combined_sales$year, " ", combined_sales$month), "%Y %m"))
print(combined_sales, width = Inf)
## # A tibble: 2,001 x 13
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       12         12        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0      911        911        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0      496        496        0        0          0
##  6 CHERY               0       30         30        0        0          0
##  7 CITROEN             0      394        394       41      207        248
##  8 DACIA               0     1235       1235        0      221        221
##  9 DS                  0        8          8        0        0          0
## 10 FERRARI             0        3          3        0        0          0
##    total_dom total_imp total_total  year month YearMonth 
##        <dbl>     <dbl>       <dbl> <dbl> <dbl> <date>    
##  1         0        12          12  2016     1 2016-01-01
##  2         0         2           2  2016     1 2016-01-01
##  3         0       911         911  2016     1 2016-01-01
##  4         0         0           0  2016     1 2016-01-01
##  5         0       496         496  2016     1 2016-01-01
##  6         0        30          30  2016     1 2016-01-01
##  7        41       601         642  2016     1 2016-01-01
##  8         0      1456        1456  2016     1 2016-01-01
##  9         0         8           8  2016     1 2016-01-01
## 10         0         3           3  2016     1 2016-01-01
## # ... with 1,991 more rows

Monthly Car Sales from 2016 to September 2019 by Brand Names

sales_byperiod <- combined_sales %>% group_by(YearMonth, brand_name) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales)) %>% top_n(1)
## Selecting by total_sales
sales_byperiod
## # A tibble: 46 x 3
## # Groups:   YearMonth [45]
##    YearMonth  brand_name total_sales
##    <date>     <chr>            <dbl>
##  1 2016-12-01 RENAULT          21078
##  2 2017-12-01 RENAULT          19054
##  3 2016-11-01 VOLKSWAGEN       14359
##  4 2017-11-01 RENAULT          13984
##  5 2016-05-01 RENAULT          12741
##  6 2016-06-01 VOLKSWAGEN       12406
##  7 2017-06-01 RENAULT          12233
##  8 2016-10-01 VOLKSWAGEN       12229
##  9 2016-04-01 RENAULT          12075
## 10 2018-12-01 FIAT             11846
## # ... with 36 more rows
ggplot(sales_byperiod, aes(x = YearMonth, y = total_sales, fill = brand_name)) + geom_bar(stat = "identity") + labs(x = "Months", y = "Total Car Sales", fill = "Brand Name")

Total Monthly Sales in Turkey

monthly_sales <- combined_sales %>% group_by(YearMonth) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales))
monthly_sales
## # A tibble: 45 x 2
##    YearMonth  total_sales
##    <date>           <dbl>
##  1 2016-12-01      141912
##  2 2017-12-01      136240
##  3 2016-11-01      122309
##  4 2017-11-01      100859
##  5 2016-05-01       93904
##  6 2017-10-01       91752
##  7 2016-06-01       91540
##  8 2017-05-01       85422
##  9 2016-04-01       84887
## 10 2017-06-01       83658
## # ... with 35 more rows
ggplot(monthly_sales, aes(x = YearMonth, y = total_sales, color = total_sales)) + geom_point() + geom_smooth() + labs(x = "Months", y = "Total Car Sales", color = "Total Car Sales")

Total Number of Car Sales by Brand from 2016 to September 2019

total_car_numbers <- combined_sales %>% group_by(brand_name) %>% summarise(total_total = sum(total_total)) %>% arrange(desc(total_total))
total_car_numbers
## # A tibble: 49 x 2
##    brand_name    total_total
##    <chr>               <dbl>
##  1 RENAULT            380614
##  2 VOLKSWAGEN         347657
##  3 FIAT               338887
##  4 FORD               312159
##  5 HYUNDAI            151963
##  6 TOYOTA             148515
##  7 DACIA              136003
##  8 OPEL               130706
##  9 PEUGEOT            121911
## 10 MERCEDES-BENZ      107718
## # ... with 39 more rows
ggplot(total_car_numbers, aes(x = brand_name, y = total_total, size = total_total, color = brand_name)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) + labs(x = "Brand Name", y = "Total Car Sales", color = "Brand Name", size = "")

Ratio of the Number of Cars in Domestic Status to the Total Number of Cars per Month

domestic_percentage <- combined_sales %>% group_by(YearMonth) %>% summarise(total_dom = sum(total_dom), total_total = sum(total_total)) %>% mutate(percent_total_dom = total_dom/total_total) %>% arrange(desc(percent_total_dom))
domestic_percentage
## # A tibble: 45 x 4
##    YearMonth  total_dom total_total percent_total_dom
##    <date>         <dbl>       <dbl>             <dbl>
##  1 2019-05-01     16229       33016             0.492
##  2 2019-06-01     19986       42688             0.468
##  3 2019-03-01     22395       49221             0.455
##  4 2019-09-01     19085       41992             0.454
##  5 2019-07-01      8122       17927             0.453
##  6 2019-04-01     13615       30971             0.440
##  7 2019-02-01     10516       24875             0.423
##  8 2019-01-01      6068       14373             0.422
##  9 2018-12-01     31072       77706             0.400
## 10 2018-01-01     13915       35076             0.397
## # ... with 35 more rows
ggplot(domestic_percentage, aes(x = YearMonth, y = percent_total_dom, size = percent_total_dom, color = percent_total_dom)) + geom_point() + scale_color_gradientn(colours = rainbow(2)) + labs(x = "Months", y = "Percentage of Domestic Cars in Total", color = "", size="")

Top 5 Market Share Percentage 2016-2019

top5_market_share <- combined_sales %>% group_by(year, brand_name) %>% summarise(total_sales = sum(total_total)) %>% mutate(percent = total_sales / sum(total_sales)) %>% top_n(5, percent)
top5_market_share
## # A tibble: 20 x 4
## # Groups:   year [4]
##     year brand_name total_sales percent
##    <dbl> <chr>            <dbl>   <dbl>
##  1  2016 FIAT            106106  0.108 
##  2  2016 FORD            109604  0.111 
##  3  2016 OPEL             55471  0.0564
##  4  2016 RENAULT         121707  0.124 
##  5  2016 VOLKSWAGEN      134535  0.137 
##  6  2017 FIAT            120049  0.126 
##  7  2017 FORD            111064  0.116 
##  8  2017 HYUNDAI          52265  0.0547
##  9  2017 RENAULT         130276  0.136 
## 10  2017 VOLKSWAGEN      117481  0.123 
## 11  2018 FIAT             70058  0.113 
## 12  2018 FORD             65428  0.105 
## 13  2018 RENAULT          85839  0.138 
## 14  2018 TOYOTA           33978  0.0547
## 15  2018 VOLKSWAGEN       66834  0.108 
## 16  2019 FIAT             42674  0.152 
## 17  2019 FORD             26063  0.0926
## 18  2019 RENAULT          42792  0.152 
## 19  2019 TOYOTA           16496  0.0586
## 20  2019 VOLKSWAGEN       28807  0.102
ggplot(top5_market_share, aes(x = year, y = percent, color = brand_name)) + geom_point() + geom_line()

Top 20 Firms Holding the Market Share between 2016-2019

combined_sales$time<-as.Date(as.yearmon(paste(combined_sales$year, " ", combined_sales$month), "%Y %m"))
M_S<-combined_sales%>%select(brand_name,year,total_total)%>%group_by(brand_name)%>%mutate(t_total=sum(total_total))
M_S1<-M_S[!duplicated(M_S$brand_name), ]
sum_total<-sum(M_S1$t_total)
M_S2<-M_S1%>%arrange(desc(t_total))%>%head(20)
ggplot(M_S2,aes(x=reorder(brand_name,-t_total),y=t_total/sum_total,color=brand_name))+geom_col(aes(fill=brand_name))+theme(axis.text.x = element_text(angle = 90, hjust = 1))+labs(title = "2016-2019 Total Market Share in Percentage",y="Market Share in Percentage",x="Company Names")