# read the csv files
BEERS = read.csv ("Beers.csv", header = TRUE)
BREWERIES = read.csv ("Breweries.csv", header = TRUE)
# check first five rows
head(BEERS, n = 5)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
head(BREWERIES, n = 5)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
# look at data frame
str(BEERS)
## 'data.frame': 2410 obs. of 7 variables:
## $ Name : chr "Pub Beer" "Devil's Cup" "Rise of the Phoenix" "Sinister" ...
## $ Beer_ID : int 1436 2265 2264 2263 2262 2261 2260 2259 2258 2131 ...
## $ ABV : num 0.05 0.066 0.071 0.09 0.075 0.077 0.045 0.065 0.055 0.086 ...
## $ IBU : int NA NA NA NA NA NA NA NA NA NA ...
## $ Brewery_id: int 409 178 178 178 178 178 178 178 178 178 ...
## $ Style : chr "American Pale Lager" "American Pale Ale (APA)" "American IPA" "American Double / Imperial IPA" ...
## $ Ounces : num 12 12 12 12 12 12 12 12 12 12 ...
str(BREWERIES)
## 'data.frame': 558 obs. of 4 variables:
## $ Brew_ID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Name : chr "NorthGate Brewing " "Against the Grain Brewery" "Jack's Abby Craft Lagers" "Mike Hess Brewing Company" ...
## $ City : chr "Minneapolis" "Louisville" "Framingham" "San Diego" ...
## $ State : chr " MN" " KY" " MA" " CA" ...
# visualization
# display summary statistics
summary(BEERS)
## Name Beer_ID ABV IBU
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00
## Mode :character Median :1453.5 Median :0.05600 Median : 35.00
## Mean :1431.1 Mean :0.05977 Mean : 42.71
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00
## Max. :2692.0 Max. :0.12800 Max. :138.00
## NA's :62 NA's :1005
## Brewery_id Style Ounces
## Min. : 1.0 Length:2410 Min. : 8.40
## 1st Qu.: 94.0 Class :character 1st Qu.:12.00
## Median :206.0 Mode :character Median :12.00
## Mean :232.7 Mean :13.59
## 3rd Qu.:367.0 3rd Qu.:16.00
## Max. :558.0 Max. :32.00
##
summary(BREWERIES)
## Brew_ID Name City State
## Min. : 1.0 Length:558 Length:558 Length:558
## 1st Qu.:140.2 Class :character Class :character Class :character
## Median :279.5 Mode :character Mode :character Mode :character
## Mean :279.5
## 3rd Qu.:418.8
## Max. :558.0
# full table view
View(BEERS)
View(BREWERIES)
# view count of breweries by state
breweries_by_state <- BREWERIES %>%
group_by (State) %>%
summarize (Count = n()) %>%
arrange (desc(Count))
View(breweries_by_state)
# total number of breweries in US unfiltered
total_breweries <- sum(breweries_by_state$Count)
# total number of beers in US unfiltered
total_beers <- count(BEERS)
na_abv <- sum(is.na(BEERS$ABV))
na_abv
## [1] 62
which(is.na(BEERS$ABV))
## [1] 295 300 316 317 318 319 408 409 451 452 465 521 522 523 524
## [16] 542 567 656 657 724 729 730 731 732 733 734 735 813 837 925
## [31] 1031 1118 1133 1168 1169 1170 1171 1172 1173 1174 1354 1555 1556 1557 1559
## [46] 1637 1668 1669 1715 1720 1761 1814 1888 1970 2045 2052 2164 2187 2338 2346
## [61] 2348 2350
na_ibu <- sum(is.na(BEERS$IBU))
na_ibu
## [1] 1005
# ggplot2 bar plot of breweries by state
barplot_of_breweries_by_state <- BREWERIES %>%
group_by(State) %>%
ggplot(aes(State, fill = State)) + geom_bar(stat = "count") +
ggtitle(paste("Barplot of Breweries by State (US), n = ", total_breweries)) +
xlab("State") + ylab("Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
barplot_of_breweries_by_state
# TODO histogram of popular sizes by style
#hist(BEERS$Ounces, main = paste("Histogram of Beers by Volume (Ounce), n = ", length(table(beerOunceAsFactor))), xlab = "Volume (oz.)", ylab = "Frequency", col = "lightblue", border = "black")
plot(BEERS$IBU, BEERS$ABV, pch = 3, xlab = "IBU", ylab = "ABV(%)", main = paste("Scatter Plot of International Bitterness Unit v. Alcohol by Content, n = ", total_beers))
# full merge beer and breweries data frames
BEER_AND_BREWERIES <- merge(BEERS, BREWERIES, by.x = "Brewery_id", by.y = "Brew_ID", all = TRUE) %>%
rename(Beer = Name.x, Brewery = Name.y)
# view complete table
View(BEER_AND_BREWERIES)
# print the first 6 observations
print(head(BEER_AND_BREWERIES, n = 6))
## Brewery_id Beer Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Brewery City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
# print the last 6 observations
print(tail(BEER_AND_BREWERIES, n = 6))
## Brewery_id Beer Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
# tidy data
library(naniar)
# plot missing variables
gg_miss_var(BEERS) # IBU, ABV Missing
gg_miss_var(BREWERIES)
gg_miss_var(BEER_AND_BREWERIES)
# test for NA values
sum(is.na(BEERS$ABV)) # missing 62 values
## [1] 62
sum(is.na(BEERS$IBU)) # missing 1005 values
## [1] 1005
sum(is.na(BEER_AND_BREWERIES)) # missing 1067 values
## [1] 1067
# store index of rows containing NA
missing_index <- which(is.na(BEER_AND_BREWERIES))
missing_index
## [1] 7305 7306 7416 7423 7457 7482 7670 7671 7738 7739 7740 7741
## [13] 7798 7799 7800 7801 7802 7803 7804 7859 8019 8021 8023 8130
## [25] 8222 8223 8224 8226 8388 8528 8654 8710 8861 9117 9122 9149
## [37] 9154 9181 9326 9407 9408 9414 9457 9464 9472 9489 9490 9491
## [49] 9492 9526 9575 9581 9582 9583 9584 9585 9586 9587 9595 9600
## [61] 9624 9625 9657 9675 9676 9677 9678 9679 9680 9681 9682 9683
## [73] 9684 9685 9686 9688 9689 9691 9692 9694 9695 9696 9697 9698
## [85] 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710
## [97] 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722
## [109] 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734
## [121] 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746
## [133] 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9764
## [145] 9765 9767 9770 9775 9779 9785 9792 9798 9799 9800 9803 9804
## [157] 9805 9808 9809 9812 9815 9826 9833 9834 9838 9843 9854 9855
## [169] 9856 9858 9860 9867 9875 9877 9878 9884 9892 9893 9894 9895
## [181] 9896 9897 9899 9900 9903 9907 9909 9910 9911 9912 9913 9914
## [193] 9917 9918 9927 9931 9932 9933 9934 9936 9937 9938 9939 9940
## [205] 9941 9942 9943 9944 9945 9946 9947 9948 9949 9951 9968 9969
## [217] 9978 10000 10001 10002 10003 10006 10007 10010 10011 10012 10013 10015
## [229] 10016 10019 10020 10024 10025 10027 10030 10031 10032 10033 10035 10037
## [241] 10039 10044 10050 10051 10052 10053 10056 10057 10058 10059 10060 10062
## [253] 10063 10079 10080 10081 10083 10085 10086 10087 10088 10089 10090 10091
## [265] 10093 10094 10097 10099 10104 10105 10108 10117 10118 10121 10125 10132
## [277] 10133 10135 10136 10142 10143 10144 10145 10146 10147 10148 10149 10150
## [289] 10151 10155 10178 10179 10180 10181 10186 10187 10188 10189 10197 10199
## [301] 10201 10202 10208 10209 10210 10211 10212 10213 10214 10215 10216 10225
## [313] 10226 10227 10229 10236 10237 10238 10239 10240 10251 10252 10253 10254
## [325] 10255 10256 10261 10266 10267 10268 10269 10274 10275 10278 10284 10294
## [337] 10295 10296 10297 10299 10301 10302 10303 10304 10305 10306 10322 10323
## [349] 10336 10344 10353 10355 10358 10363 10365 10369 10372 10385 10388 10389
## [361] 10391 10398 10402 10406 10416 10421 10429 10430 10431 10433 10435 10445
## [373] 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457
## [385] 10465 10466 10468 10469 10470 10471 10478 10479 10480 10481 10484 10488
## [397] 10489 10493 10496 10501 10502 10503 10515 10516 10517 10518 10519 10520
## [409] 10521 10522 10523 10527 10528 10529 10530 10531 10532 10540 10541 10551
## [421] 10552 10553 10554 10556 10572 10574 10575 10587 10591 10595 10596 10597
## [433] 10598 10599 10600 10603 10604 10606 10607 10630 10631 10632 10633 10634
## [445] 10635 10636 10640 10641 10643 10650 10651 10653 10662 10663 10664 10668
## [457] 10669 10672 10673 10674 10675 10690 10691 10694 10695 10696 10697 10698
## [469] 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710
## [481] 10711 10712 10713 10714 10715 10716 10717 10719 10720 10721 10722 10723
## [493] 10724 10725 10726 10727 10728 10729 10730 10731 10733 10734 10736 10737
## [505] 10743 10758 10759 10760 10761 10762 10763 10768 10773 10780 10781 10794
## [517] 10795 10796 10797 10798 10799 10800 10801 10803 10819 10820 10821 10822
## [529] 10824 10825 10827 10828 10831 10838 10839 10840 10841 10842 10845 10846
## [541] 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10860
## [553] 10870 10871 10872 10873 10875 10877 10878 10879 10880 10881 10882 10883
## [565] 10894 10895 10905 10908 10926 10929 10932 10933 10937 10938 10939 10940
## [577] 10941 10942 10943 10950 10957 10967 10968 10969 10970 10971 10972 10979
## [589] 10991 11002 11010 11013 11014 11017 11019 11021 11022 11023 11026 11027
## [601] 11028 11029 11030 11031 11032 11034 11036 11038 11039 11040 11041 11044
## [613] 11051 11053 11054 11055 11056 11057 11058 11059 11062 11063 11064 11066
## [625] 11067 11071 11073 11083 11086 11087 11088 11089 11090 11091 11096 11108
## [637] 11109 11110 11111 11117 11118 11119 11120 11127 11132 11141 11143 11146
## [649] 11148 11149 11150 11151 11155 11163 11165 11182 11184 11185 11186 11187
## [661] 11188 11190 11191 11192 11193 11196 11197 11203 11205 11206 11208 11209
## [673] 11210 11211 11212 11213 11215 11216 11217 11218 11220 11225 11229 11232
## [685] 11236 11237 11240 11241 11242 11243 11244 11245 11247 11250 11253 11254
## [697] 11260 11267 11268 11269 11270 11271 11272 11273 11274 11275 11276 11277
## [709] 11280 11290 11291 11294 11295 11301 11302 11303 11304 11305 11306 11307
## [721] 11308 11314 11315 11316 11317 11318 11319 11320 11323 11324 11325 11337
## [733] 11346 11347 11348 11351 11352 11367 11368 11369 11375 11377 11378 11379
## [745] 11380 11381 11382 11383 11385 11386 11388 11389 11394 11395 11401 11405
## [757] 11406 11412 11419 11420 11421 11422 11423 11424 11425 11428 11429 11432
## [769] 11444 11445 11446 11452 11453 11456 11472 11474 11475 11476 11477 11479
## [781] 11480 11481 11484 11494 11495 11501 11502 11503 11504 11514 11516 11517
## [793] 11518 11519 11522 11526 11527 11528 11529 11530 11531 11532 11533 11534
## [805] 11535 11540 11549 11554 11555 11556 11557 11558 11559 11560 11561 11562
## [817] 11563 11564 11565 11566 11569 11570 11571 11572 11579 11580 11581 11582
## [829] 11583 11590 11591 11592 11593 11594 11599 11600 11601 11602 11603 11606
## [841] 11613 11614 11615 11616 11617 11618 11619 11620 11623 11624 11625 11631
## [853] 11632 11633 11634 11635 11636 11637 11639 11640 11641 11642 11644 11646
## [865] 11647 11648 11649 11650 11654 11655 11656 11657 11658 11662 11663 11672
## [877] 11673 11674 11675 11676 11680 11681 11682 11683 11685 11686 11688 11689
## [889] 11690 11693 11695 11703 11704 11705 11706 11711 11712 11713 11714 11715
## [901] 11716 11717 11718 11722 11724 11725 11726 11727 11728 11729 11734 11735
## [913] 11736 11742 11745 11746 11752 11753 11754 11755 11765 11772 11774 11775
## [925] 11776 11777 11783 11784 11785 11786 11787 11792 11794 11795 11796 11797
## [937] 11798 11799 11800 11808 11809 11810 11811 11812 11815 11817 11818 11819
## [949] 11821 11822 11823 11824 11825 11826 11827 11836 11837 11838 11841 11844
## [961] 11845 11846 11854 11855 11856 11857 11867 11868 11874 11875 11879 11882
## [973] 11885 11888 11892 11893 11894 11899 11900 11901 11902 11905 11906 11907
## [985] 11909 11910 11911 11912 11924 11926 11927 11928 11930 11931 11932 11933
## [997] 11934 11935 11936 11945 11946 11947 11948 11949 11950 11951 11956 11957
## [1009] 11963 11964 11965 11966 11967 11970 11971 11972 11973 11982 11983 11984
## [1021] 11985 11987 11991 11992 11993 11994 11995 11996 11997 12001 12002 12003
## [1033] 12004 12005 12006 12007 12008 12009 12010 12011 12012 12013 12014 12015
## [1045] 12018 12019 12022 12023 12032 12033 12034 12035 12036 12037 12038 12039
## [1057] 12040 12041 12042 12043 12044 12045 12046 12047 12048 12049 12050
# test the function complete.cases() on gg_miss_var
gg_miss_var(BEERS[complete.cases(BEERS$IBU, BEERS$ABV), ])
# replace missing NA values with new value
# BEERS <- BEERS[complete.cases(BEERS$IBU, BEERS$ABV), ]
# NOTE: South Dakota IBU is NA for all, missing IBU data
# filter and summarize ABV by state
median_by_state <- BEER_AND_BREWERIES %>%
group_by(State) %>%
#filter(!is.na(ABV) & !is.na(IBU)) %>%
summarize(median_ABV = median(ABV, na.rm = TRUE), median_IBU = median(IBU, na.rm = TRUE))
(median_by_state)
## # A tibble: 51 × 3
## State median_ABV median_IBU
## <chr> <dbl> <dbl>
## 1 " AK" 0.056 46
## 2 " AL" 0.06 43
## 3 " AR" 0.052 39
## 4 " AZ" 0.055 20.5
## 5 " CA" 0.058 42
## 6 " CO" 0.0605 40
## 7 " CT" 0.06 29
## 8 " DC" 0.0625 47.5
## 9 " DE" 0.055 52
## 10 " FL" 0.057 55
## # ℹ 41 more rows
# store sample size
total_states <- count(median_by_state)
total_states
## # A tibble: 1 × 1
## n
## <int>
## 1 51
# bar plot median ABV
barplot_median_abv <- median_by_state %>%
ggplot(aes(State, median_ABV, color = median_ABV, fill = median_ABV)) +
geom_bar(stat = "identity") +
ggtitle(paste("Barplot of Median Alcohol by Volume (% ABV) by State (US), n = ", total_states)) +
xlab("State") + ylab("Median ABV (%)") + ylim(0, 0.15) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
barplot_median_abv
# bar plot median IBU
barplot_median_ibu <- median_by_state %>%
ggplot(aes(State, median_IBU, color = median_IBU, fill = median_IBU)) +
geom_bar(stat = "identity") +
ggtitle(paste("Barplot of Median Internation Bitterness Unit (IBU) by State (US), n = ", total_states)) +
xlab("State(") + ylab("Median IBU") + ylim(0, 70) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
barplot_median_ibu
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# PART V #5. Which state has the maximum alcoholic (ABV) beer? Which
state has the most bitter (IBU) beer?
# group filter and summarize max abv and ibu values
max_by_state <- BEER_AND_BREWERIES %>%
group_by(State) %>%
#filter(!is.na(ABV) & !is.na(IBU)) %>%
summarize(max_ABV = max(ABV, na.rm = TRUE), max_IBU = max(IBU, na.rm = TRUE))
## Warning: There was 1 warning in `summarize()`.
## ℹ In argument: `max_IBU = max(IBU, na.rm = TRUE)`.
## ℹ In group 42: `State = " SD"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
max_by_state
## # A tibble: 51 × 3
## State max_ABV max_IBU
## <chr> <dbl> <dbl>
## 1 " AK" 0.068 71
## 2 " AL" 0.093 103
## 3 " AR" 0.061 39
## 4 " AZ" 0.095 99
## 5 " CA" 0.099 115
## 6 " CO" 0.128 104
## 7 " CT" 0.09 85
## 8 " DC" 0.092 115
## 9 " DE" 0.055 52
## 10 " FL" 0.082 82
## # ℹ 41 more rows
# bar plot max ABV
barplot_max_abv <- max_by_state %>%
ggplot(aes(State, max_ABV, color = max_ABV, fill = max_ABV)) +
geom_bar(stat = "identity") +
ggtitle(paste("Barplot of Maximum Alcohol by Volume (% ABV) by State (US), n = ", total_states)) +
xlab("State") + ylab("Max ABV (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
barplot_max_abv
# bar plot max IBU
barplot_max_ibu <- max_by_state %>%
ggplot(aes(State, max_IBU, color = max_IBU, fill = max_IBU)) +
geom_bar(stat = "identity") +
ggtitle(paste("Barplot of Maximum Internation Bitterness Unit (IBU) by State (US), n = ", total_states)) +
xlab("State") + ylab("Maximum IBU") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
barplot_max_ibu
## Warning: Removed 1 rows containing missing values (`geom_bar()`).
# console output max abv and ibu values
max(max_by_state$max_ABV)
## [1] 0.128
max(max_by_state$max_IBU)
## [1] 138
# store max abv and ibu index
max_abv <- which.max(max_by_state$max_ABV)
max_ibu <- which.max(max_by_state$max_IBU)
# console output max abv and ibu pointers
max_abv
## [1] 6
max_ibu
## [1] 38
# find max abv and ibu state by index
max_abv_state <- max_by_state$State[max_abv]
max_ibu_state <- max_by_state$State[max_ibu]
# console output max abv and ibu
max_abv_state # "CO"
## [1] " CO"
max_ibu_state # "OR"
## [1] " OR"
# import map libraries
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(mapproj)
# merge max and med abv and ibu
max_med_abv_ibu <- merge(max_by_state, median_by_state, by = "State", all = TRUE)
max_med_abv_ibu
## State max_ABV max_IBU median_ABV median_IBU
## 1 AK 0.068 71 0.0560 46.0
## 2 AL 0.093 103 0.0600 43.0
## 3 AR 0.061 39 0.0520 39.0
## 4 AZ 0.095 99 0.0550 20.5
## 5 CA 0.099 115 0.0580 42.0
## 6 CO 0.128 104 0.0605 40.0
## 7 CT 0.090 85 0.0600 29.0
## 8 DC 0.092 115 0.0625 47.5
## 9 DE 0.055 52 0.0550 52.0
## 10 FL 0.082 82 0.0570 55.0
## 11 GA 0.072 65 0.0550 55.0
## 12 HI 0.083 75 0.0540 22.5
## 13 IA 0.095 99 0.0555 26.0
## 14 ID 0.099 100 0.0565 39.0
## 15 IL 0.096 100 0.0580 30.0
## 16 IN 0.120 115 0.0580 33.0
## 17 KS 0.085 110 0.0500 20.0
## 18 KY 0.125 80 0.0625 31.5
## 19 LA 0.088 60 0.0520 31.5
## 20 MA 0.099 130 0.0540 35.0
## 21 MD 0.085 90 0.0580 29.0
## 22 ME 0.099 70 0.0510 61.0
## 23 MI 0.099 115 0.0620 35.0
## 24 MN 0.099 120 0.0560 44.5
## 25 MO 0.080 89 0.0520 24.0
## 26 MS 0.080 80 0.0580 45.0
## 27 MT 0.075 80 0.0550 40.0
## 28 NC 0.099 98 0.0570 33.5
## 29 ND 0.067 70 0.0500 32.0
## 30 NE 0.096 65 0.0560 35.0
## 31 NH 0.065 82 0.0550 48.5
## 32 NJ 0.099 100 0.0460 34.5
## 33 NM 0.080 100 0.0620 51.0
## 34 NV 0.099 90 0.0600 41.0
## 35 NY 0.100 111 0.0550 47.0
## 36 OH 0.099 126 0.0580 40.0
## 37 OK 0.085 100 0.0600 35.0
## 38 OR 0.088 138 0.0560 40.0
## 39 PA 0.099 113 0.0570 30.0
## 40 RI 0.086 75 0.0550 24.0
## 41 SC 0.097 65 0.0550 30.0
## 42 SD 0.069 -Inf 0.0600 NA
## 43 TN 0.062 61 0.0570 37.0
## 44 TX 0.099 118 0.0550 33.0
## 45 UT 0.090 83 0.0400 34.0
## 46 VA 0.088 135 0.0565 42.0
## 47 VT 0.096 120 0.0550 30.0
## 48 WA 0.084 83 0.0555 38.0
## 49 WI 0.099 80 0.0520 19.0
## 50 WV 0.067 71 0.0620 57.5
## 51 WY 0.072 75 0.0500 21.0
# merge brewery count with max and med abv and ibu
breweries_map_data <- merge(max_med_abv_ibu, breweries_by_state, by = "State")
#colnames(breweries_map_data)[1] = "abb"
breweries_map_data$State <- trimws(breweries_map_data$State)
breweries_map_data
## State max_ABV max_IBU median_ABV median_IBU Count
## 1 AK 0.068 71 0.0560 46.0 7
## 2 AL 0.093 103 0.0600 43.0 3
## 3 AR 0.061 39 0.0520 39.0 2
## 4 AZ 0.095 99 0.0550 20.5 11
## 5 CA 0.099 115 0.0580 42.0 39
## 6 CO 0.128 104 0.0605 40.0 47
## 7 CT 0.090 85 0.0600 29.0 8
## 8 DC 0.092 115 0.0625 47.5 1
## 9 DE 0.055 52 0.0550 52.0 2
## 10 FL 0.082 82 0.0570 55.0 15
## 11 GA 0.072 65 0.0550 55.0 7
## 12 HI 0.083 75 0.0540 22.5 4
## 13 IA 0.095 99 0.0555 26.0 5
## 14 ID 0.099 100 0.0565 39.0 5
## 15 IL 0.096 100 0.0580 30.0 18
## 16 IN 0.120 115 0.0580 33.0 22
## 17 KS 0.085 110 0.0500 20.0 3
## 18 KY 0.125 80 0.0625 31.5 4
## 19 LA 0.088 60 0.0520 31.5 5
## 20 MA 0.099 130 0.0540 35.0 23
## 21 MD 0.085 90 0.0580 29.0 7
## 22 ME 0.099 70 0.0510 61.0 9
## 23 MI 0.099 115 0.0620 35.0 32
## 24 MN 0.099 120 0.0560 44.5 12
## 25 MO 0.080 89 0.0520 24.0 9
## 26 MS 0.080 80 0.0580 45.0 2
## 27 MT 0.075 80 0.0550 40.0 9
## 28 NC 0.099 98 0.0570 33.5 19
## 29 ND 0.067 70 0.0500 32.0 1
## 30 NE 0.096 65 0.0560 35.0 5
## 31 NH 0.065 82 0.0550 48.5 3
## 32 NJ 0.099 100 0.0460 34.5 3
## 33 NM 0.080 100 0.0620 51.0 4
## 34 NV 0.099 90 0.0600 41.0 2
## 35 NY 0.100 111 0.0550 47.0 16
## 36 OH 0.099 126 0.0580 40.0 15
## 37 OK 0.085 100 0.0600 35.0 6
## 38 OR 0.088 138 0.0560 40.0 29
## 39 PA 0.099 113 0.0570 30.0 25
## 40 RI 0.086 75 0.0550 24.0 5
## 41 SC 0.097 65 0.0550 30.0 4
## 42 SD 0.069 -Inf 0.0600 NA 1
## 43 TN 0.062 61 0.0570 37.0 3
## 44 TX 0.099 118 0.0550 33.0 28
## 45 UT 0.090 83 0.0400 34.0 4
## 46 VA 0.088 135 0.0565 42.0 16
## 47 VT 0.096 120 0.0550 30.0 10
## 48 WA 0.084 83 0.0555 38.0 23
## 49 WI 0.099 80 0.0520 19.0 20
## 50 WV 0.067 71 0.0620 57.5 1
## 51 WY 0.072 75 0.0500 21.0 4
# store state in lookup dataframe
lookup = data.frame(State = state.abb, region = state.name) #makes a data frame with State name and abbreviation.
lookup <- bind_rows(lookup, data.frame(State = "DC", region = "Washington DC")) # Add missing row
lookup <- lookup %>% arrange(lookup$State)
lookup$region <- tolower(lookup$region)
#lookup
# create state map data frame
us_states <- map_data("state")
us_states$subregion <- NULL
#us_states
# Found the bug! Blank space prevents merging, use trimws function
unique(breweries_map_data$State)
## [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "GA" "HI" "IA" "ID" "IL"
## [16] "IN" "KS" "KY" "LA" "MA" "MD" "ME" "MI" "MN" "MO" "MS" "MT" "NC" "ND" "NE"
## [31] "NH" "NJ" "NM" "NV" "NY" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VA" "VT" "WA" "WI" "WV" "WY"
unique(lookup$State)
## [1] "AK" "AL" "AR" "AZ" "CA" "CO" "CT" "DC" "DE" "FL" "GA" "HI" "IA" "ID" "IL"
## [16] "IN" "KS" "KY" "LA" "MA" "MD" "ME" "MI" "MN" "MO" "MS" "MT" "NC" "ND" "NE"
## [31] "NH" "NJ" "NM" "NV" "NY" "OH" "OK" "OR" "PA" "RI" "SC" "SD" "TN" "TX" "UT"
## [46] "VA" "VT" "WA" "WI" "WV" "WY"
# merge breweries and lookup data
breweries_lookup <- merge(breweries_map_data, lookup, by = "State")
breweries_lookup
## State max_ABV max_IBU median_ABV median_IBU Count region
## 1 AK 0.068 71 0.0560 46.0 7 alaska
## 2 AL 0.093 103 0.0600 43.0 3 alabama
## 3 AR 0.061 39 0.0520 39.0 2 arkansas
## 4 AZ 0.095 99 0.0550 20.5 11 arizona
## 5 CA 0.099 115 0.0580 42.0 39 california
## 6 CO 0.128 104 0.0605 40.0 47 colorado
## 7 CT 0.090 85 0.0600 29.0 8 connecticut
## 8 DC 0.092 115 0.0625 47.5 1 washington dc
## 9 DE 0.055 52 0.0550 52.0 2 delaware
## 10 FL 0.082 82 0.0570 55.0 15 florida
## 11 GA 0.072 65 0.0550 55.0 7 georgia
## 12 HI 0.083 75 0.0540 22.5 4 hawaii
## 13 IA 0.095 99 0.0555 26.0 5 iowa
## 14 ID 0.099 100 0.0565 39.0 5 idaho
## 15 IL 0.096 100 0.0580 30.0 18 illinois
## 16 IN 0.120 115 0.0580 33.0 22 indiana
## 17 KS 0.085 110 0.0500 20.0 3 kansas
## 18 KY 0.125 80 0.0625 31.5 4 kentucky
## 19 LA 0.088 60 0.0520 31.5 5 louisiana
## 20 MA 0.099 130 0.0540 35.0 23 massachusetts
## 21 MD 0.085 90 0.0580 29.0 7 maryland
## 22 ME 0.099 70 0.0510 61.0 9 maine
## 23 MI 0.099 115 0.0620 35.0 32 michigan
## 24 MN 0.099 120 0.0560 44.5 12 minnesota
## 25 MO 0.080 89 0.0520 24.0 9 missouri
## 26 MS 0.080 80 0.0580 45.0 2 mississippi
## 27 MT 0.075 80 0.0550 40.0 9 montana
## 28 NC 0.099 98 0.0570 33.5 19 north carolina
## 29 ND 0.067 70 0.0500 32.0 1 north dakota
## 30 NE 0.096 65 0.0560 35.0 5 nebraska
## 31 NH 0.065 82 0.0550 48.5 3 new hampshire
## 32 NJ 0.099 100 0.0460 34.5 3 new jersey
## 33 NM 0.080 100 0.0620 51.0 4 new mexico
## 34 NV 0.099 90 0.0600 41.0 2 nevada
## 35 NY 0.100 111 0.0550 47.0 16 new york
## 36 OH 0.099 126 0.0580 40.0 15 ohio
## 37 OK 0.085 100 0.0600 35.0 6 oklahoma
## 38 OR 0.088 138 0.0560 40.0 29 oregon
## 39 PA 0.099 113 0.0570 30.0 25 pennsylvania
## 40 RI 0.086 75 0.0550 24.0 5 rhode island
## 41 SC 0.097 65 0.0550 30.0 4 south carolina
## 42 SD 0.069 -Inf 0.0600 NA 1 south dakota
## 43 TN 0.062 61 0.0570 37.0 3 tennessee
## 44 TX 0.099 118 0.0550 33.0 28 texas
## 45 UT 0.090 83 0.0400 34.0 4 utah
## 46 VA 0.088 135 0.0565 42.0 16 virginia
## 47 VT 0.096 120 0.0550 30.0 10 vermont
## 48 WA 0.084 83 0.0555 38.0 23 washington
## 49 WI 0.099 80 0.0520 19.0 20 wisconsin
## 50 WV 0.067 71 0.0620 57.5 1 west virginia
## 51 WY 0.072 75 0.0500 21.0 4 wyoming
# merge breweries lookup and us states by region
map.df <- merge(breweries_lookup, us_states, by = "region", all.x = T)
map.df <- map.df[order(map.df$order),] # arrange order desc, fix polygon clipping
#map.df
# plot the map data
breweries_heat_map <- map.df %>% ggplot(aes(x=long,y=lat,group=group))+
geom_polygon(aes(fill = Count))+
geom_path() +
scale_fill_gradientn(colours=rev(heat.colors(10)),na.value="grey90")+ggtitle(paste("Number of Breweries by State, n = ", total_breweries)) +
coord_map()
breweries_heat_map
## Warning: Removed 3 rows containing missing values (`geom_path()`).
# max abv heat map
max_abv_heat_map <- map.df %>% ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(aes(fill = max_ABV)) + geom_path() + scale_fill_gradientn(colors = rev(heat.colors(10)), na.value = "grey90") +
ggtitle(paste("Maximum Alcohol by Volume (% ABV) by State, n = ", total_states)) +
coord_map()
max_abv_heat_map
## Warning: Removed 3 rows containing missing values (`geom_path()`).
# max ibu heat map
max_ibu_heat_map <- map.df %>% ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(aes(fill = max_IBU)) + geom_path() + scale_fill_gradientn(colors = rev(heat.colors(10)), na.value = "grey90") +
ggtitle(paste("Maximum International Bitterness Unit (IBU) by State, n = ", total_states)) +
coord_map()
max_ibu_heat_map
## Warning: Removed 3 rows containing missing values (`geom_path()`).
# median abv heat map
med_abv_heat_map <- map.df %>% ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(aes(fill = median_ABV)) + geom_path() + scale_fill_gradientn(colors = rev(heat.colors(10)), na.value = "grey90") +
ggtitle(paste("Median Alcohol by Volume (% ABV) by State, n = ", total_states)) +
coord_map()
med_abv_heat_map
## Warning: Removed 3 rows containing missing values (`geom_path()`).
# median ibu heat map
med_ibu_heat_map <- map.df %>% ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon(aes(fill = median_IBU)) + geom_path() + scale_fill_gradientn(colors = rev(heat.colors(10)), na.value = "grey90") +
ggtitle(paste("Median International Bitterness Unit (IBU) by State, n = ", total_states)) +
coord_map()
med_ibu_heat_map
## Warning: Removed 3 rows containing missing values (`geom_path()`).
# string_r, which, grepl
# TODO search for keywords 'Ale', 'Pilsner', 'Stout'
# scatter plot of IBU v. ABV
ggplot(data = BEER_AND_BREWERIES) +
geom_point(mapping = aes(IBU, ABV, color = State, size = Ounces)) +
geom_smooth(mapping = aes(IBU, ABV, linetype = as.factor(Ounces))) +
ggtitle(paste("Scatter plot Alcohol by Volume (% ABV) v. International Bitterness Unit (IBU) by Ounce (oz.), n = ", total_beers))+
xlab("ABV (%)") + ylab("IBU")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (`stat_smooth()`).
## Warning: Computation failed in `stat_smooth()`
## Caused by error in `smooth.construct.cr.smooth.spec()`:
## ! x has insufficient unique values to support 10 knots: reduce k.
## Warning: Removed 1005 rows containing missing values (`geom_point()`).
#facet_wrap(~Ounces)
#8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually. #In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
# filter by ale
ale_abv_ibu <- BEER_AND_BREWERIES %>%
select(Ounces, Style, ABV, IBU) %>%
filter(grepl("Ale", Style))
ale_abv_ibu$Class <- "Ale"
dim(ale_abv_ibu)
## [1] 976 5
#ale_abv_ibu
# filter by ipa
ipa_abv_ibu <- BEER_AND_BREWERIES %>%
select(Ounces, Style, ABV, IBU) %>%
filter(grepl("IPA", Style))
ipa_abv_ibu$Class <- "IPA"
# combine ale and ipa list
ale_and_ipa <- rbind(ipa_abv_ibu, ale_abv_ibu)
#install package and load library
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# plot ggpairs by class ipa v. ale
ggpairs_ale_ipa_abv_ibu_by_class <- ale_and_ipa %>%
select(ABV, IBU, Class) %>%
ggpairs(aes(color = Class))
ggpairs_ale_ipa_abv_ibu_by_class
## Warning: Removed 43 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 596 rows containing missing values
## Warning: Removed 43 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 596 rows containing missing values (`geom_point()`).
## Warning: Removed 596 rows containing non-finite values (`stat_density()`).
## Warning: Removed 596 rows containing non-finite values (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 43 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 596 rows containing non-finite values (`stat_bin()`).
# suppress warning: standard deviation is zero - ABV, IBU NA values
# plot ggpairs by ounce ipa v. ale
ggpairs_ale_ipa_abv_ibu_by_ounces <- ale_and_ipa %>%
select(ABV, IBU, Ounces) %>%
ggpairs(aes(na.rm = TRUE, color = as.factor(Ounces)))
ggpairs_ale_ipa_abv_ibu_by_ounces
## Warning: Removed 43 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 596 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 43 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 596 rows containing missing values (`geom_point()`).
## Warning: Removed 596 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 596 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 43 rows containing missing values (`geom_point()`).
## Warning: Removed 596 rows containing missing values (`geom_point()`).
## KNN Classification # Evaluate the ML classification model using
K-Nearest Neighbor algorithm to predict ale and IPA class
# install KNN libraries
#install.packages("class")
#install.packages("caret")
#install.packages("e1071")
library(class)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
# count total ale and ipa
count(ale_and_ipa) # 1547 samples total, including NA
## n
## 1 1547
ale_and_ipa <- na.omit(ale_and_ipa) # omit NA values
count(ale_and_ipa) # omit NA, 951 remain
## n
## 1 951
# set up train and test set
trainIndex <- sample(seq(1, 951), 665)
trainBeers <- ale_and_ipa[trainIndex,]
testBeers <- ale_and_ipa[-trainIndex,]
# plot train set
trainBeers %>% ggplot(aes(ABV, IBU, color = Class)) + geom_point()
# knn classification
classification = knn(trainBeers[, c("ABV", "IBU")], testBeers[, c("ABV", "IBU")], trainBeers$Class, k = 5, prob = TRUE )
classification
## [1] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale
## [19] IPA Ale IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale
## [37] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA Ale Ale IPA IPA IPA
## [55] IPA IPA IPA IPA IPA IPA Ale Ale Ale Ale IPA IPA Ale IPA IPA IPA Ale Ale
## [73] IPA IPA IPA IPA Ale Ale Ale IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA
## [91] Ale IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA
## [109] IPA Ale IPA IPA IPA Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [127] Ale Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [145] Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale IPA
## [163] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale IPA Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale
## [199] Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale
## [217] Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale
## [235] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale IPA
## [253] Ale Ale IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [271] Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale
## attr(,"prob")
## [1] 0.8333333 1.0000000 1.0000000 0.8000000 0.8888889 0.8333333 0.8000000
## [8] 1.0000000 0.6666667 0.6666667 0.8000000 0.8750000 0.8571429 0.8888889
## [15] 1.0000000 1.0000000 1.0000000 1.0000000 0.7777778 0.8000000 0.8333333
## [22] 0.8000000 0.8571429 0.8571429 0.6250000 1.0000000 0.7500000 1.0000000
## [29] 1.0000000 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [36] 0.7142857 0.6000000 0.8000000 0.8750000 0.6000000 0.8000000 1.0000000
## [43] 1.0000000 0.8333333 0.8000000 1.0000000 0.8000000 0.8000000 1.0000000
## [50] 0.7777778 0.7142857 1.0000000 0.8888889 0.8000000 0.8571429 0.8000000
## [57] 1.0000000 0.8000000 0.8000000 0.8000000 0.7500000 0.8333333 0.8333333
## [64] 0.8333333 0.8333333 0.8000000 0.8000000 0.8888889 1.0000000 0.8333333
## [71] 0.8333333 0.8333333 0.8888889 0.8000000 0.6000000 0.9166667 0.7777778
## [78] 0.7777778 0.8000000 1.0000000 0.8750000 1.0000000 1.0000000 1.0000000
## [85] 1.0000000 0.8000000 0.7000000 1.0000000 0.8333333 0.8000000 0.8571429
## [92] 1.0000000 0.8333333 0.9000000 1.0000000 0.8000000 0.8571429 1.0000000
## [99] 1.0000000 1.0000000 1.0000000 0.8461538 0.6000000 1.0000000 0.8571429
## [106] 0.8000000 0.8571429 0.5000000 0.8000000 0.7777778 0.8571429 0.8888889
## [113] 0.6250000 1.0000000 1.0000000 0.6000000 1.0000000 0.6000000 0.7777778
## [120] 0.8333333 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8750000
## [127] 1.0000000 1.0000000 1.0000000 1.0000000 0.9000000 0.9000000 0.8000000
## [134] 1.0000000 1.0000000 1.0000000 0.7142857 1.0000000 0.8000000 1.0000000
## [141] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [148] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8571429 0.8000000
## [155] 1.0000000 0.8000000 0.8571429 0.8000000 0.5000000 1.0000000 1.0000000
## [162] 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [169] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [176] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [183] 0.8333333 1.0000000 0.7500000 0.8000000 0.8000000 1.0000000 0.7000000
## [190] 0.7000000 1.0000000 0.7000000 1.0000000 1.0000000 1.0000000 0.8000000
## [197] 1.0000000 1.0000000 1.0000000 0.8333333 0.8750000 1.0000000 0.7142857
## [204] 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 0.8571429 1.0000000
## [211] 1.0000000 0.7000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [218] 0.8000000 1.0000000 0.8000000 1.0000000 0.8000000 0.8333333 1.0000000
## [225] 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000 1.0000000
## [232] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
## [239] 1.0000000 1.0000000 0.8000000 1.0000000 0.8333333 1.0000000 1.0000000
## [246] 1.0000000 0.6000000 0.6000000 0.8000000 1.0000000 1.0000000 0.6666667
## [253] 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000 0.8333333
## [260] 1.0000000 0.8000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000
## [267] 1.0000000 1.0000000 0.6000000 0.8000000 1.0000000 1.0000000 0.5000000
## [274] 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.7000000 1.0000000
## [281] 1.0000000 1.0000000 0.7000000 0.7777778 1.0000000 1.0000000
## Levels: Ale IPA
table <- table(classification, testBeers$Class)
table
##
## classification Ale IPA
## Ale 153 21
## IPA 20 92
CM <- confusionMatrix(table, mode = "everything")
CM
## Confusion Matrix and Statistics
##
##
## classification Ale IPA
## Ale 153 21
## IPA 20 92
##
## Accuracy : 0.8566
## 95% CI : (0.8106, 0.8951)
## No Information Rate : 0.6049
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6996
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8844
## Specificity : 0.8142
## Pos Pred Value : 0.8793
## Neg Pred Value : 0.8214
## Precision : 0.8793
## Recall : 0.8844
## F1 : 0.8818
## Prevalence : 0.6049
## Detection Rate : 0.5350
## Detection Prevalence : 0.6084
## Balanced Accuracy : 0.8493
##
## 'Positive' Class : Ale
##
# k = 5, train test only
loocv_classification <- knn.cv(trainBeers[, c("ABV", "IBU")], trainBeers$Class, k = 5)
loocv_classification
## [1] Ale IPA IPA Ale Ale Ale Ale IPA Ale IPA IPA Ale IPA IPA Ale IPA Ale Ale
## [19] IPA Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [37] IPA Ale IPA Ale IPA Ale Ale IPA IPA Ale IPA Ale Ale Ale Ale Ale IPA Ale
## [55] IPA Ale Ale Ale Ale IPA Ale IPA Ale Ale IPA Ale IPA IPA IPA Ale Ale Ale
## [73] Ale Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale IPA
## [91] IPA Ale Ale IPA IPA IPA Ale IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA
## [109] IPA Ale Ale Ale Ale Ale Ale IPA IPA IPA Ale Ale IPA Ale Ale IPA Ale IPA
## [127] IPA IPA IPA Ale Ale Ale IPA IPA Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale
## [145] Ale Ale IPA Ale IPA IPA IPA Ale IPA Ale Ale IPA IPA IPA Ale IPA IPA Ale
## [163] Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale IPA IPA IPA Ale IPA Ale
## [181] IPA Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA
## [199] Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA
## [217] IPA IPA Ale Ale Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale
## [235] Ale IPA IPA Ale IPA IPA Ale IPA Ale IPA IPA IPA IPA IPA Ale Ale Ale Ale
## [253] Ale IPA IPA Ale IPA Ale IPA Ale IPA Ale Ale IPA IPA IPA Ale Ale IPA Ale
## [271] Ale IPA IPA Ale Ale Ale IPA IPA IPA IPA Ale Ale Ale IPA Ale IPA IPA Ale
## [289] Ale Ale Ale IPA IPA Ale Ale IPA IPA Ale Ale Ale IPA Ale Ale Ale IPA Ale
## [307] Ale IPA IPA Ale Ale Ale IPA Ale Ale IPA Ale Ale Ale Ale IPA IPA Ale Ale
## [325] Ale Ale Ale IPA IPA Ale IPA Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [343] IPA IPA Ale Ale IPA IPA Ale Ale Ale IPA IPA Ale IPA IPA IPA IPA IPA Ale
## [361] IPA Ale Ale IPA Ale Ale IPA Ale Ale Ale IPA Ale IPA IPA IPA Ale Ale IPA
## [379] Ale IPA Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [397] IPA Ale Ale IPA IPA IPA Ale IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA
## [415] Ale IPA IPA IPA IPA Ale IPA Ale Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA
## [433] Ale IPA Ale IPA Ale IPA IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale IPA IPA
## [451] Ale Ale Ale IPA Ale IPA Ale IPA Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [469] Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale IPA IPA Ale IPA IPA
## [487] Ale IPA IPA Ale IPA Ale IPA Ale IPA Ale Ale Ale Ale IPA IPA Ale Ale Ale
## [505] IPA Ale Ale IPA IPA IPA Ale IPA Ale IPA Ale Ale Ale IPA Ale Ale IPA IPA
## [523] Ale IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA Ale
## [541] IPA Ale IPA Ale IPA IPA Ale Ale Ale Ale Ale Ale IPA IPA Ale IPA IPA IPA
## [559] Ale IPA IPA Ale Ale Ale IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale IPA Ale
## [577] IPA Ale IPA Ale Ale Ale IPA IPA Ale IPA IPA Ale IPA Ale IPA Ale Ale Ale
## [595] IPA Ale Ale Ale Ale IPA Ale Ale IPA Ale IPA IPA IPA IPA IPA IPA Ale Ale
## [613] Ale IPA Ale IPA IPA Ale Ale IPA Ale Ale Ale IPA IPA IPA IPA IPA Ale Ale
## [631] Ale IPA IPA Ale IPA Ale IPA IPA IPA IPA IPA Ale Ale Ale Ale IPA Ale IPA
## [649] Ale Ale IPA Ale IPA Ale IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA
## Levels: Ale IPA
loocv_table <- table(loocv_classification, trainBeers$Class)
loocv_table
##
## loocv_classification Ale IPA
## Ale 348 52
## IPA 38 227
loocv_cm <- confusionMatrix(loocv_table, mode = "everything")
loocv_cm
## Confusion Matrix and Statistics
##
##
## loocv_classification Ale IPA
## Ale 348 52
## IPA 38 227
##
## Accuracy : 0.8647
## 95% CI : (0.8363, 0.8897)
## No Information Rate : 0.5805
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7202
##
## Mcnemar's Test P-Value : 0.1706
##
## Sensitivity : 0.9016
## Specificity : 0.8136
## Pos Pred Value : 0.8700
## Neg Pred Value : 0.8566
## Precision : 0.8700
## Recall : 0.9016
## F1 : 0.8855
## Prevalence : 0.5805
## Detection Rate : 0.5233
## Detection Prevalence : 0.6015
## Balanced Accuracy : 0.8576
##
## 'Positive' Class : Ale
##
Above code: KNN support the finding that IBU and ABV seem to be positivly correlated
#9. Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
# use of themes and advanced plots
hist_abv <- BEER_AND_BREWERIES %>%
ggplot(aes(ABV, na.rm = TRUE)) + geom_histogram()
hist_abv
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (`stat_bin()`).
Above code: TBD
6. Comment on the summary statistics and distribution of the ABV variable.