This is a worksheet of ODD Assignment.The dataset for this project includes car sales numbers from the beginning of 2016 through September 2019.
We begin the project by reading the excel files that we download from the ODD website. For sales_2018, data gathering and celaning code given: (for all years, same code structure was used)
#library(tidyverse)
#library(readxl)
#2018 Sales Data#
#jan_2018<-read_excel("2018.01.xlsx",skip=7,col_names=FALSE)
#feb_2018<-read_excel("2018.02.xlsx",skip=7,col_names=FALSE)
#mar_2018<-read_excel("2018.03.xlsx",skip=7,col_names=FALSE)
#apr_2018<-read_excel("2018.04.xlsx",skip=7,col_names=FALSE)
#may_2018<-read_excel("2018.05.xlsx",skip=7,col_names=FALSE)
#jun_2018<-read_excel("2018.06.xlsx",skip=7,col_names=FALSE)
#jul_2018<-read_excel("2018.07.xlsx",skip=7,col_names=FALSE)
#aug_2018<-read_excel("2018.08.xlsx",skip=7,col_names=FALSE)
#sep_2018<-read_excel("2018.09.xlsx",skip=7,col_names=FALSE)
#oct_2018<-ead_excel("2018.10.xlsx",skip=7,col_names=FALSE)
#nov_2018<-read_excel("2018.11.xlsx",skip=7,col_names=FALSE)
#dec_2018<-read_excel("2018.12.xlsx",skip=7,col_names=FALSE)
#Slicing unnecessary rows from data
#jan_2018<-jan_2018%>%slice(-c(44,45))
#feb_2018<-feb_2018%>%slice(-c(44,45))
#mar_2018<-mar_2018%>%slice(-c(44,45))
#apr_2018<-apr_2018%>%slice(-c(43,44))
#may_2018<-may_2018%>%slice(-c(43,44))
#jun_2018<-jun_2018%>%slice(-c(43,44))
#jul_2018<-jul_2018%>%slice(-c(43,44))
#aug_2018<-aug_2018%>%slice(-c(43,44))
#sep_2018<-sep_2018%>%slice(-c(43,44))
#oct_2018<-oct_2018%>%slice(-c(43,44))
#nov_2018<-nov_2018%>%slice(-c(43,44))
#dec_2018<-dec_2018%>%slice(-c(43,44))
#Changing column names#
#dfs<-c("jan_2018","feb_2018","mar_2018","apr_2018","may_2018","jun_2018","jul_2018","aug_2018","sep_2018","oct_2018","nov_2018","dec_2018")
#for(df in dfs) {
# df.tmp <- get(df)
# names(df.tmp) <- c("brand_name","auto_dom","auto_imp","auto_total","comm_dom",
# "comm_imp","comm_total","total_dom","total_imp","total_total")
# assign(df, df.tmp)
# }
#Creating Month and Year Column#
#jan_2018 <- jan_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 1)
#feb_2018 <- feb_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 2)
#mar_2018 <- mar_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 3)
#apr_2018 <- apr_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 4)
#may_2018 <- may_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 5)
#jun_2018 <- jun_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 6)
#jul_2018 <- jul_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 7)
#aug_2018 <- aug_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 8)
#sep_2018 <- sep_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 9)
#oct_2018 <- oct_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 10)
#nov_2018 <- nov_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 11)
#dec_2018 <- dec_2018 %>% mutate_if(is.numeric,funs(ifelse(is.na(.),0,.))) %>% mutate(year = 2018, month = 12)
#Combined 2018 Sales Data#
#sales_2018 <- rbind(jan_2018, feb_2018, mar_2018, apr_2018, may_2018, jun_2018, jul_2018, aug_2018, sep_2018, oct_2018, nov_2018, dec_2018)
#saveRDS(sales_2018,file="sales_2018.rds")
library(tidyverse)
library(ggplot2)
library(zoo)
library(scales)
library(formattable)
library(knitr)
sales_2016 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2016.rds?raw=true"))
sales_2017 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2017.rds?raw=true"))
sales_2018 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2018.rds?raw=true"))
sales_2019 <- readRDS(url("https://github.com/pjournal/mef03g-spo-R-ify/blob/master/sales_2019.rds?raw=true"))
combined_sales <- rbind(sales_2016, sales_2017, sales_2018, sales_2019)
combined_sales <- combined_sales %>% filter(!str_detect(brand_name,"ODD"))
combined_sales$brand_name <- str_replace_all(combined_sales$brand_name,"ASTON MARTĆĀ°N","ASTON MARTIN")
We should create the date column for analysis
combined_sales$YearMonth <- as.Date(as.yearmon(paste(combined_sales$year, " ", combined_sales$month), "%Y %m"))
print(combined_sales, width = Inf)
## # A tibble: 2,001 x 13
## brand_name auto_dom auto_imp auto_total comm_dom comm_imp comm_total
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ALFA ROMEO 0 12 12 0 0 0
## 2 ASTON MARTIN 0 2 2 0 0 0
## 3 AUDI 0 911 911 0 0 0
## 4 BENTLEY 0 0 0 0 0 0
## 5 BMW 0 496 496 0 0 0
## 6 CHERY 0 30 30 0 0 0
## 7 CITROEN 0 394 394 41 207 248
## 8 DACIA 0 1235 1235 0 221 221
## 9 DS 0 8 8 0 0 0
## 10 FERRARI 0 3 3 0 0 0
## total_dom total_imp total_total year month YearMonth
## <dbl> <dbl> <dbl> <dbl> <dbl> <date>
## 1 0 12 12 2016 1 2016-01-01
## 2 0 2 2 2016 1 2016-01-01
## 3 0 911 911 2016 1 2016-01-01
## 4 0 0 0 2016 1 2016-01-01
## 5 0 496 496 2016 1 2016-01-01
## 6 0 30 30 2016 1 2016-01-01
## 7 41 601 642 2016 1 2016-01-01
## 8 0 1456 1456 2016 1 2016-01-01
## 9 0 8 8 2016 1 2016-01-01
## 10 0 3 3 2016 1 2016-01-01
## # ... with 1,991 more rows
sales_byperiod <- combined_sales %>% group_by(YearMonth, brand_name) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales)) %>% top_n(1)
## Selecting by total_sales
sales_byperiod
## # A tibble: 46 x 3
## # Groups: YearMonth [45]
## YearMonth brand_name total_sales
## <date> <chr> <dbl>
## 1 2016-12-01 RENAULT 21078
## 2 2017-12-01 RENAULT 19054
## 3 2016-11-01 VOLKSWAGEN 14359
## 4 2017-11-01 RENAULT 13984
## 5 2016-05-01 RENAULT 12741
## 6 2016-06-01 VOLKSWAGEN 12406
## 7 2017-06-01 RENAULT 12233
## 8 2016-10-01 VOLKSWAGEN 12229
## 9 2016-04-01 RENAULT 12075
## 10 2018-12-01 FIAT 11846
## # ... with 36 more rows
ggplot(sales_byperiod, aes(x = YearMonth, y = total_sales, fill = brand_name)) + geom_bar(stat = "identity") + labs(x = "Months", y = "Total Car Sales", fill = "Brand Name")
monthly_sales <- combined_sales %>% group_by(YearMonth) %>% summarise(total_sales = sum(total_total)) %>% arrange(desc(total_sales))
monthly_sales
## # A tibble: 45 x 2
## YearMonth total_sales
## <date> <dbl>
## 1 2016-12-01 141912
## 2 2017-12-01 136240
## 3 2016-11-01 122309
## 4 2017-11-01 100859
## 5 2016-05-01 93904
## 6 2017-10-01 91752
## 7 2016-06-01 91540
## 8 2017-05-01 85422
## 9 2016-04-01 84887
## 10 2017-06-01 83658
## # ... with 35 more rows
ggplot(monthly_sales, aes(x = YearMonth, y = total_sales, color = total_sales)) + geom_point() + geom_smooth() + labs(x = "Months", y = "Total Car Sales", color = "Total Car Sales")
total_car_numbers <- combined_sales %>% group_by(brand_name) %>% summarise(total_total = sum(total_total)) %>% arrange(desc(total_total))
total_car_numbers
## # A tibble: 49 x 2
## brand_name total_total
## <chr> <dbl>
## 1 RENAULT 380614
## 2 VOLKSWAGEN 347657
## 3 FIAT 338887
## 4 FORD 312159
## 5 HYUNDAI 151963
## 6 TOYOTA 148515
## 7 DACIA 136003
## 8 OPEL 130706
## 9 PEUGEOT 121911
## 10 MERCEDES-BENZ 107718
## # ... with 39 more rows
ggplot(total_car_numbers, aes(x = brand_name, y = total_total, size = total_total, color = brand_name)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) + labs(x = "Brand Name", y = "Total Car Sales", color = "Brand Name", size = "")
domestic_percentage <- combined_sales %>% group_by(YearMonth) %>% summarise(total_dom = sum(total_dom), total_total = sum(total_total)) %>% mutate(percent_total_dom = total_dom/total_total) %>% arrange(desc(percent_total_dom))
domestic_percentage
## # A tibble: 45 x 4
## YearMonth total_dom total_total percent_total_dom
## <date> <dbl> <dbl> <dbl>
## 1 2019-05-01 16229 33016 0.492
## 2 2019-06-01 19986 42688 0.468
## 3 2019-03-01 22395 49221 0.455
## 4 2019-09-01 19085 41992 0.454
## 5 2019-07-01 8122 17927 0.453
## 6 2019-04-01 13615 30971 0.440
## 7 2019-02-01 10516 24875 0.423
## 8 2019-01-01 6068 14373 0.422
## 9 2018-12-01 31072 77706 0.400
## 10 2018-01-01 13915 35076 0.397
## # ... with 35 more rows
ggplot(domestic_percentage, aes(x = YearMonth, y = percent_total_dom, size = percent_total_dom, color = percent_total_dom)) + geom_point() + scale_color_gradientn(colours = rainbow(2)) + labs(x = "Months", y = "Percentage of Domestic Cars in Total", color = "", size="")
slct_1<-combined_sales %>% group_by(year,month) %>% summarise(sum_dom=sum(total_dom),sum_imp=sum(total_imp),sum_total=sum(total_total),sum_dom) %>% ungroup()
formattable(slct_1)
year | month | sum_dom | sum_imp | sum_total |
---|---|---|---|---|
2016 | 1 | 12190 | 20523 | 32713 |
2016 | 2 | 16711 | 36114 | 52825 |
2016 | 3 | 25593 | 57355 | 82948 |
2016 | 4 | 28714 | 56173 | 84887 |
2016 | 5 | 30435 | 63469 | 93904 |
2016 | 6 | 27314 | 64226 | 91540 |
2016 | 7 | 16380 | 42153 | 58533 |
2016 | 8 | 19178 | 52378 | 71556 |
2016 | 9 | 19345 | 48248 | 67593 |
2016 | 10 | 24904 | 58096 | 83000 |
2016 | 11 | 38870 | 83439 | 122309 |
2016 | 12 | 51230 | 90682 | 141912 |
2017 | 1 | 13783 | 21540 | 35323 |
2017 | 2 | 16693 | 30272 | 46965 |
2017 | 3 | 25543 | 48259 | 73802 |
2017 | 4 | 26546 | 49442 | 75988 |
2017 | 5 | 30440 | 54982 | 85422 |
2017 | 6 | 31002 | 52656 | 83658 |
2017 | 7 | 29698 | 52599 | 82297 |
2017 | 8 | 22820 | 49716 | 72536 |
2017 | 9 | 25002 | 46350 | 71352 |
2017 | 10 | 31073 | 60679 | 91752 |
2017 | 11 | 34827 | 66032 | 100859 |
2017 | 12 | 52886 | 83354 | 136240 |
2018 | 1 | 13915 | 21161 | 35076 |
2018 | 2 | 17578 | 29431 | 47009 |
2018 | 3 | 28575 | 47770 | 76345 |
2018 | 4 | 25388 | 45738 | 71126 |
2018 | 5 | 27238 | 45517 | 72755 |
2018 | 6 | 19842 | 31195 | 51037 |
2018 | 7 | 20071 | 32663 | 52734 |
2018 | 8 | 11590 | 22756 | 34346 |
2018 | 9 | 7836 | 15192 | 23028 |
2018 | 10 | 7885 | 13686 | 21571 |
2018 | 11 | 19532 | 38672 | 58204 |
2018 | 12 | 31072 | 46634 | 77706 |
2019 | 1 | 6068 | 8305 | 14373 |
2019 | 2 | 10516 | 14359 | 24875 |
2019 | 3 | 22395 | 26826 | 49221 |
2019 | 4 | 13615 | 17356 | 30971 |
2019 | 5 | 16229 | 16787 | 33016 |
2019 | 6 | 19986 | 22702 | 42688 |
2019 | 7 | 8122 | 9805 | 17927 |
2019 | 8 | 9994 | 16252 | 26246 |
2019 | 9 | 19085 | 22907 | 41992 |
levels_month= c("1","2","3","4","5","6","7","8","9","10","11","12")
ggplot(slct_1,aes(x=ordered(as.character(month),levels=levels_month),y=sum_total,fill=ordered(year,levels=c("2016","2017","2018","2019")))) +geom_bar(stat="identity",position="dodge") + labs(y="Total Car Sales",x="Months",fill="")