ODD Dataset

This is a worksheet of ODD Assignment.The dataset for this project includes car sales numbers from the beginning of 2016 through September 2019.

Preparing and Cleaning Datasets

We begin the project by reading the excel files that we download from the ODD website. For sales_2018, data gathering and celaning code given: (for all years, same code structure was used)

#library(tidyverse)
#library(readxl)
#2018 Sales Data#
#jan_2018<-read_excel("2018.01.xlsx",skip=7,col_names=FALSE)
#feb_2018<-read_excel("2018.02.xlsx",skip=7,col_names=FALSE)
#mar_2018<-read_excel("2018.03.xlsx",skip=7,col_names=FALSE)
#apr_2018<-read_excel("2018.04.xlsx",skip=7,col_names=FALSE)
#may_2018<-read_excel("2018.05.xlsx",skip=7,col_names=FALSE)
#jun_2018<-read_excel("2018.06.xlsx",skip=7,col_names=FALSE)
#jul_2018<-read_excel("2018.07.xlsx",skip=7,col_names=FALSE)
#aug_2018<-read_excel("2018.08.xlsx",skip=7,col_names=FALSE)
#sep_2018<-read_excel("2018.09.xlsx",skip=7,col_names=FALSE)
#oct_2018<-ead_excel("2018.10.xlsx",skip=7,col_names=FALSE)
#nov_2018<-read_excel("2018.11.xlsx",skip=7,col_names=FALSE)
#dec_2018<-read_excel("2018.12.xlsx",skip=7,col_names=FALSE)
#Slicing unnecessary rows from data
#jan_2018<-jan_2018%>%slice(-c(44,45))
#feb_2018<-feb_2018%>%slice(-c(44,45))
#mar_2018<-mar_2018%>%slice(-c(44,45))
#apr_2018<-apr_2018%>%slice(-c(43,44))
#may_2018<-may_2018%>%slice(-c(43,44))
#jun_2018<-jun_2018%>%slice(-c(43,44))
#jul_2018<-jul_2018%>%slice(-c(43,44))
#aug_2018<-aug_2018%>%slice(-c(43,44))
#sep_2018<-sep_2018%>%slice(-c(43,44))
#oct_2018<-oct_2018%>%slice(-c(43,44))
#nov_2018<-nov_2018%>%slice(-c(43,44))
#dec_2018<-dec_2018%>%slice(-c(43,44))
#Changing column names#
#dfs<-c("jan_2018","feb_2018","mar_2018","apr_2018","may_2018","jun_2018","jul_2018","aug_2018","sep_2018","oct_2018","nov_2018","dec_2018")  
#for(df in dfs) {
#  df.tmp <- get(df)
#  names(df.tmp) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom",
#  "comm_imp","comm_total","total_dom","total_imp","total_total") 
#  assign(df, df.tmp)
# }
#Creating Month and Year Column#
#jan_2018 <- jan_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 1)
#feb_2018 <- feb_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 2)
#mar_2018 <- mar_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 3)
#apr_2018 <- apr_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 4)
#may_2018 <- may_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 5)
#jun_2018 <- jun_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 6)
#jul_2018 <- jul_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 7)
#aug_2018 <- aug_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 8)
#sep_2018 <- sep_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 9)
#oct_2018 <- oct_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 10)
#nov_2018 <- nov_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 11)
#dec_2018 <- dec_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 12)
#Combined 2018 Sales Data#
#sales_2018 <- rbind(jan_2018, feb_2018, mar_2018, apr_2018, may_2018, jun_2018, jul_2018, aug_2018, sep_2018, oct_2018, nov_2018, dec_2018)
#saveRDS(sales_2018,file="sales_2018.rds")

Merging Datasets

library(tidyverse)
library(ggplot2)
library(zoo)
library(scales)
library(formattable)
library(knitr)
sales_2016 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2016.rds?raw=true"))
sales_2017 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2017.rds?raw=true"))
sales_2018 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2018.rds?raw=true"))
sales_2019 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2019.rds?raw=true"))
combined_sales <- rbind(sales_2016, sales_2017, sales_2018, sales_2019)
combined_sales <- combined_sales %>% filter(!str_detect(brand_name,"ODD"))
combined_sales$brand_name <- str_replace_all(combined_sales$brand_name,"ASTON MARTƄĀ°N","ASTON MARTIN")

We should create the date column for analysis

combined_sales$YearMonth <- as.Date(as.yearmon(paste(combined_sales$year, " ", combined_sales$month), "%Y %m"))
print(combined_sales, width = Inf)
## # A tibble: 2,001 x 13
##    brand_name   auto_dom auto_imp auto_total comm_dom comm_imp comm_total
##    <chr>           <dbl>    <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
##  1 ALFA ROMEO          0       12         12        0        0          0
##  2 ASTON MARTIN        0        2          2        0        0          0
##  3 AUDI                0      911        911        0        0          0
##  4 BENTLEY             0        0          0        0        0          0
##  5 BMW                 0      496        496        0        0          0
##  6 CHERY               0       30         30        0        0          0
##  7 CITROEN             0      394        394       41      207        248
##  8 DACIA               0     1235       1235        0      221        221
##  9 DS                  0        8          8        0        0          0
## 10 FERRARI             0        3          3        0        0          0
##    total_dom total_imp total_total  year month YearMonth 
##        <dbl>     <dbl>       <dbl> <dbl> <dbl> <date>    
##  1         0        12          12  2016     1 2016-01-01
##  2         0         2           2  2016     1 2016-01-01
##  3         0       911         911  2016     1 2016-01-01
##  4         0         0           0  2016     1 2016-01-01
##  5         0       496         496  2016     1 2016-01-01
##  6         0        30          30  2016     1 2016-01-01
##  7        41       601         642  2016     1 2016-01-01
##  8         0      1456        1456  2016     1 2016-01-01
##  9         0         8           8  2016     1 2016-01-01
## 10         0         3           3  2016     1 2016-01-01
## # ... with 1,991 more rows

Monthly Car Sales from 2016 to September 2019 by Brand Names

sales_byperiod <- combined_sales %>% group_by(YearMonth, brand_name) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales)) %>% top_n(1)
## Selecting by total_sales
sales_byperiod
## # A tibble: 46 x 3
## # Groups:   YearMonth [45]
##    YearMonth  brand_name total_sales
##    <date>     <chr>            <dbl>
##  1 2016-12-01 RENAULT          21078
##  2 2017-12-01 RENAULT          19054
##  3 2016-11-01 VOLKSWAGEN       14359
##  4 2017-11-01 RENAULT          13984
##  5 2016-05-01 RENAULT          12741
##  6 2016-06-01 VOLKSWAGEN       12406
##  7 2017-06-01 RENAULT          12233
##  8 2016-10-01 VOLKSWAGEN       12229
##  9 2016-04-01 RENAULT          12075
## 10 2018-12-01 FIAT             11846
## # ... with 36 more rows
ggplot(sales_byperiod, aes(x = YearMonth, y = total_sales, fill = brand_name)) + geom_bar(stat = "identity") + labs(x = "Months", y = "Total Car Sales", fill = "Brand Name")

Total Monthly Sales in Turkey

monthly_sales <- combined_sales %>% group_by(YearMonth) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales))
monthly_sales
## # A tibble: 45 x 2
##    YearMonth  total_sales
##    <date>           <dbl>
##  1 2016-12-01      141912
##  2 2017-12-01      136240
##  3 2016-11-01      122309
##  4 2017-11-01      100859
##  5 2016-05-01       93904
##  6 2017-10-01       91752
##  7 2016-06-01       91540
##  8 2017-05-01       85422
##  9 2016-04-01       84887
## 10 2017-06-01       83658
## # ... with 35 more rows
ggplot(monthly_sales, aes(x = YearMonth, y = total_sales, color = total_sales)) + geom_point() + geom_smooth() + labs(x = "Months", y = "Total Car Sales", color = "Total Car Sales")

Total Number of Car Sales by Brand from 2016 to September 2019

total_car_numbers <- combined_sales %>% group_by(brand_name) %>% summarise(total_total = sum(total_total)) %>% arrange(desc(total_total))
total_car_numbers
## # A tibble: 49 x 2
##    brand_name    total_total
##    <chr>               <dbl>
##  1 RENAULT            380614
##  2 VOLKSWAGEN         347657
##  3 FIAT               338887
##  4 FORD               312159
##  5 HYUNDAI            151963
##  6 TOYOTA             148515
##  7 DACIA              136003
##  8 OPEL               130706
##  9 PEUGEOT            121911
## 10 MERCEDES-BENZ      107718
## # ... with 39 more rows
ggplot(total_car_numbers, aes(x = brand_name, y = total_total, size = total_total, color = brand_name)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) + labs(x = "Brand Name", y = "Total Car Sales", color = "Brand Name", size = "")

Ratio of the Number of Cars in Domestic Status to the Total Number of Cars per Month

domestic_percentage <- combined_sales %>% group_by(YearMonth) %>% summarise(total_dom = sum(total_dom), total_total = sum(total_total)) %>% mutate(percent_total_dom = total_dom/total_total) %>% arrange(desc(percent_total_dom))
domestic_percentage
## # A tibble: 45 x 4
##    YearMonth  total_dom total_total percent_total_dom
##    <date>         <dbl>       <dbl>             <dbl>
##  1 2019-05-01     16229       33016             0.492
##  2 2019-06-01     19986       42688             0.468
##  3 2019-03-01     22395       49221             0.455
##  4 2019-09-01     19085       41992             0.454
##  5 2019-07-01      8122       17927             0.453
##  6 2019-04-01     13615       30971             0.440
##  7 2019-02-01     10516       24875             0.423
##  8 2019-01-01      6068       14373             0.422
##  9 2018-12-01     31072       77706             0.400
## 10 2018-01-01     13915       35076             0.397
## # ... with 35 more rows
ggplot(domestic_percentage, aes(x = YearMonth, y = percent_total_dom, size = percent_total_dom, color = percent_total_dom)) + geom_point() + scale_color_gradientn(colours = rainbow(2)) + labs(x = "Months", y = "Percentage of Domestic Cars in Total", color = "", size="")

Top 5 Market Share Percentage 2016-2019

top5_market_share <- combined_sales %>% group_by(year, brand_name) %>% summarise(total_sales = sum(total_total)) %>% mutate(percent = total_sales / sum(total_sales)) %>% top_n(5, percent)
top5_market_share
## # A tibble: 20 x 4
## # Groups:   year [4]
##     year brand_name total_sales percent
##    <dbl> <chr>            <dbl>   <dbl>
##  1  2016 FIAT            106106  0.108 
##  2  2016 FORD            109604  0.111 
##  3  2016 OPEL             55471  0.0564
##  4  2016 RENAULT         121707  0.124 
##  5  2016 VOLKSWAGEN      134535  0.137 
##  6  2017 FIAT            120049  0.126 
##  7  2017 FORD            111064  0.116 
##  8  2017 HYUNDAI          52265  0.0547
##  9  2017 RENAULT         130276  0.136 
## 10  2017 VOLKSWAGEN      117481  0.123 
## 11  2018 FIAT             70058  0.113 
## 12  2018 FORD             65428  0.105 
## 13  2018 RENAULT          85839  0.138 
## 14  2018 TOYOTA           33978  0.0547
## 15  2018 VOLKSWAGEN       66834  0.108 
## 16  2019 FIAT             42674  0.152 
## 17  2019 FORD             26063  0.0926
## 18  2019 RENAULT          42792  0.152 
## 19  2019 TOYOTA           16496  0.0586
## 20  2019 VOLKSWAGEN       28807  0.102
ggplot(top5_market_share, aes(x = year, y = percent, color = brand_name)) + geom_point() + geom_line()

Top 20 Firms Holding the Market Share between 2016-2019

combined_sales$time<-as.Date(as.yearmon(paste(combined_sales$year, " ", combined_sales$month), "%Y %m"))
M_S<-combined_sales%>%select(brand_name,year,total_total)%>%group_by(brand_name)%>%mutate(t_total=sum(total_total))
M_S1<-M_S[!duplicated(M_S$brand_name), ]
sum_total<-sum(M_S1$t_total)
M_S2<-M_S1%>%arrange(desc(t_total))%>%head(20)
ggplot(M_S2,aes(x=reorder(brand_name,-t_total),y=t_total/sum_total,color=brand_name))+geom_col(aes(fill=brand_name))+theme(axis.text.x = element_text(angle = 90, hjust = 1))+labs(title = "2016-2019 Total Market Share in Percentage",y="Market Share in Percentage",x="Company Names")

Total Imported Car Shares of Brands between 2016-2019

T_I<-combined_sales%>%select(brand_name,total_imp,time)%>%group_by(time)%>% mutate(perc_total_imp = total_imp/sum(total_imp))%>%top_n(10)
ggplot(T_I,aes(x=time,y=total_imp,color=brand_name))+geom_col(aes(fill=brand_name))+scale_x_date(labels = date_format("%Y %m"))+labs(title = "2016-2019 Total Imports",y="Total Imports",x="Time")

Total Domestic Car Shares of Brands between 2016-2019

T_D<-combined_sales%>%select(brand_name,total_dom,time)%>%group_by(time)%>% mutate(perc_total_dom = total_dom/sum(total_dom))%>%top_n(10)  
ggplot(T_D,aes(x=time,y=total_dom))+geom_col(aes(fill=brand_name))+scale_x_date(labels = date_format("%Y %m"))+labs(title = "2016-2019 Total Domestic",y="Total Domestic",x="Time")

Total Domestic, Imported Car Sales in Months Beween 2016-2019

slct_1<-combined_sales %>% group_by(year,month) %>% summarise(sum_dom=sum(total_dom),sum_imp=sum(total_imp),sum_total=sum(total_total),sum_dom) %>% ungroup()
formattable(slct_1)
year month sum_dom sum_imp sum_total
2016 1 12190 20523 32713
2016 2 16711 36114 52825
2016 3 25593 57355 82948
2016 4 28714 56173 84887
2016 5 30435 63469 93904
2016 6 27314 64226 91540
2016 7 16380 42153 58533
2016 8 19178 52378 71556
2016 9 19345 48248 67593
2016 10 24904 58096 83000
2016 11 38870 83439 122309
2016 12 51230 90682 141912
2017 1 13783 21540 35323
2017 2 16693 30272 46965
2017 3 25543 48259 73802
2017 4 26546 49442 75988
2017 5 30440 54982 85422
2017 6 31002 52656 83658
2017 7 29698 52599 82297
2017 8 22820 49716 72536
2017 9 25002 46350 71352
2017 10 31073 60679 91752
2017 11 34827 66032 100859
2017 12 52886 83354 136240
2018 1 13915 21161 35076
2018 2 17578 29431 47009
2018 3 28575 47770 76345
2018 4 25388 45738 71126
2018 5 27238 45517 72755
2018 6 19842 31195 51037
2018 7 20071 32663 52734
2018 8 11590 22756 34346
2018 9 7836 15192 23028
2018 10 7885 13686 21571
2018 11 19532 38672 58204
2018 12 31072 46634 77706
2019 1 6068 8305 14373
2019 2 10516 14359 24875
2019 3 22395 26826 49221
2019 4 13615 17356 30971
2019 5 16229 16787 33016
2019 6 19986 22702 42688
2019 7 8122 9805 17927
2019 8 9994 16252 26246
2019 9 19085 22907 41992

Bar Graph of 4 Year Total Car Sales in Months

levels_month= c("1","2","3","4","5","6","7","8","9","10","11","12")
ggplot(slct_1,aes(x=ordered(as.character(month),levels=levels_month),y=sum_total,fill=ordered(year,levels=c("2016","2017","2018","2019")))) +geom_bar(stat="identity",position="dodge") + labs(y="Total Car Sales",x="Months",fill="")