Установим и подгрузим пакеты.
# install.packages('readr')
# install.packages('dplyr')
# install.packages('ggplot2')
library(readr)
library(dplyr)
library(ggplot2)
Будем работать с файлом counties.rds. Формат .rds — это формат данных для R, который сохраняет ваш датафрейм (df) в том виде, в котором вы его видели. Его очень легко подгрузить с помощью функции read_rds() из пакета readr.
Это набор данных, сформированный по итогам переписи населения США 2015 года.
Немножко информации о некоторых переменных:
state – название штата США (всего 50)
county – округ штата
region – регион
metro – относится ли округ к агломерации (значения Metro / Nonmetro)
population – количество жителей
men – количество мужчин
women – количество женщин
unemployment – процент безработицы
land_area – площадь округа
private_work, public_work, self_employed, family_work – процент работающих на частные компании, на государство, на себя и процент работающих в семейном бизнесе
employed – количество занятых (работающих) жителей
income – доход
walk – доля граждан (в %), которые ходят на работу пешком
Сколько наблюдений и переменных в нашем наборе данных? Что является наблюдением? Какое первое значение в переменной income?
## Observations: 3,138
## Variables: 40
## $ census_id <chr> "1001", "1003", "1005", "1007", "1009", "1011…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "…
## $ county <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blo…
## $ region <chr> "South", "South", "South", "South", "South", …
## $ metro <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro…
## $ population <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20…
## $ men <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502…
## $ women <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 1085…
## $ hispanic <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, …
## $ white <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.…
## $ black <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3,…
## $ native <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, …
## $ asian <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, …
## $ pacific <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ citizens <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 155…
## $ income <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 322…
## $ income_err <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925…
## $ income_per_cap <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 183…
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 13…
## $ poverty <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.…
## $ child_poverty <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.…
## $ professional <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.…
## $ service <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.…
## $ office <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.…
## $ construction <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5…
## $ production <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.…
## $ drive <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.…
## $ carpool <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, …
## $ transit <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, …
## $ walk <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, …
## $ other_transp <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, …
## $ work_at_home <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, …
## $ mean_commute <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.…
## $ employed <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, …
## $ private_work <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.…
## $ public_work <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.…
## $ self_employed <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, …
## $ family_work <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, …
## $ unemployment <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8…
## $ land_area <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.…
California
.## # A tibble: 58 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6001 Cali… Alame… West Metro 1584983 776699 808284 22.6
## 2 6003 Cali… Alpine West Nonm… 1131 654 477 9.5
## 3 6005 Cali… Amador West Nonm… 36995 20012 16983 12.9
## 4 6007 Cali… Butte West Metro 222564 110115 112449 15.2
## 5 6009 Cali… Calav… West Nonm… 44767 22143 22624 11.1
## 6 6011 Cali… Colusa West Nonm… 21396 11129 10267 57.4
## 7 6013 Cali… Contr… West Metro 1096068 534618 561450 24.9
## 8 6015 Cali… Del N… West Nonm… 27788 15418 12370 19.1
## 9 6017 Cali… El Do… West Metro 182093 90970 91123 12.5
## 10 6019 Cali… Fresno West Metro 956749 477316 479433 51.6
## # … with 48 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
California
и Florida
.## # A tibble: 125 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6001 Cali… Alame… West Metro 1584983 776699 808284 22.6
## 2 6003 Cali… Alpine West Nonm… 1131 654 477 9.5
## 3 6005 Cali… Amador West Nonm… 36995 20012 16983 12.9
## 4 6007 Cali… Butte West Metro 222564 110115 112449 15.2
## 5 6009 Cali… Calav… West Nonm… 44767 22143 22624 11.1
## 6 6011 Cali… Colusa West Nonm… 21396 11129 10267 57.4
## 7 6013 Cali… Contr… West Metro 1096068 534618 561450 24.9
## 8 6015 Cali… Del N… West Nonm… 27788 15418 12370 19.1
## 9 6017 Cali… El Do… West Metro 182093 90970 91123 12.5
## 10 6019 Cali… Fresno West Metro 956749 477316 479433 51.6
## # … with 115 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
California
, Florida
, Alabama
и Indiana
.## # A tibble: 284 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 274 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
California
, Florida
, Alabama
и Indiana
.
# Keep only counties located in one of the four listed states whose
# population is at least 20,000. The period above closes the preceding
# sentence; it must not be attached to `counties`, otherwise the pipeline
# would start from the undefined object `.counties`.
counties %>%
  filter(
    state %in% c('California', 'Florida', 'Alabama', 'Indiana'),
    population >= 20000
  )
## # A tibble: 226 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 7 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 8 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 9 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## 10 1021 Alab… Chilt… South Metro 43819 21619 22200 7.6
## # … with 216 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
California
.## # A tibble: 3,080 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 3,070 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
## # A tibble: 3,138 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 3,128 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
## # A tibble: 3,138 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 3,128 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
## # A tibble: 3,138 x 40
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6037 Cali… Los A… West Metro 10038388 4.95e6 5.09e6 48.2
## 2 17031 Illi… Cook North… Metro 5236393 2.54e6 2.70e6 24.7
## 3 48201 Texas Harris South Metro 4356362 2.17e6 2.19e6 41.6
## 4 4013 Ariz… Maric… West Metro 4018143 1.99e6 2.03e6 30.1
## 5 6073 Cali… San D… West Metro 3223096 1.62e6 1.60e6 32.9
## 6 6059 Cali… Orange West Metro 3116069 1.54e6 1.58e6 34.2
## 7 12086 Flor… Miami… South Metro 2639042 1.28e6 1.36e6 65.6
## 8 36047 New … Kings North… Metro 2595259 1.23e6 1.37e6 19.6
## 9 48113 Texas Dallas South Metro 2485003 1.23e6 1.26e6 39
## 10 36081 New … Queens North… Metro 2301139 1.12e6 1.19e6 27.9
## # … with 3,128 more rows, and 31 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>
state
, county
, population
и unemployment
.## # A tibble: 3,138 x 4
## state county population unemployment
## <chr> <chr> <dbl> <dbl>
## 1 Alabama Autauga 55221 7.6
## 2 Alabama Baldwin 195121 7.5
## 3 Alabama Barbour 26932 17.6
## 4 Alabama Bibb 22604 8.3
## 5 Alabama Blount 57710 7.7
## 6 Alabama Bullock 10678 18
## 7 Alabama Butler 20354 10.9
## 8 Alabama Calhoun 116648 12.3
## 9 Alabama Chambers 34079 8.9
## 10 Alabama Cherokee 26008 7.9
## # … with 3,128 more rows
state
, county
, region
, metro
, population
, men
и women
.## # A tibble: 3,138 x 7
## state county region metro population men women
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alabama Autauga South Metro 55221 26745 28476
## 2 Alabama Baldwin South Metro 195121 95314 99807
## 3 Alabama Barbour South Nonmetro 26932 14497 12435
## 4 Alabama Bibb South Metro 22604 12073 10531
## 5 Alabama Blount South Metro 57710 28512 29198
## 6 Alabama Bullock South Nonmetro 10678 5660 5018
## 7 Alabama Butler South Nonmetro 20354 9502 10852
## 8 Alabama Calhoun South Metro 116648 56274 60374
## 9 Alabama Chambers South Nonmetro 34079 16258 17821
## 10 Alabama Cherokee South Nonmetro 26008 12975 13033
## # … with 3,128 more rows
state
, county
, region
, population
, men
и women
.## # A tibble: 3,138 x 6
## state county region population men women
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Alabama Autauga South 55221 26745 28476
## 2 Alabama Baldwin South 195121 95314 99807
## 3 Alabama Barbour South 26932 14497 12435
## 4 Alabama Bibb South 22604 12073 10531
## 5 Alabama Blount South 57710 28512 29198
## 6 Alabama Bullock South 10678 5660 5018
## 7 Alabama Butler South 20354 9502 10852
## 8 Alabama Calhoun South 116648 56274 60374
## 9 Alabama Chambers South 34079 16258 17821
## 10 Alabama Cherokee South 26008 12975 13033
## # … with 3,128 more rows
unemployed_population
, которая показывает количество безработного населения.## # A tibble: 3,138 x 41
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 3,128 more rows, and 32 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>, unemployed_population <dbl>
pop_1kk
, которая отвечает на вопрос: “Проживает ли в округе более 1000000 человек?”## # A tibble: 3,138 x 41
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 Alab… Autau… South Metro 55221 26745 28476 2.6
## 2 1003 Alab… Baldw… South Metro 195121 95314 99807 4.5
## 3 1005 Alab… Barbo… South Nonm… 26932 14497 12435 4.6
## 4 1007 Alab… Bibb South Metro 22604 12073 10531 2.2
## 5 1009 Alab… Blount South Metro 57710 28512 29198 8.6
## 6 1011 Alab… Bullo… South Nonm… 10678 5660 5018 4.4
## 7 1013 Alab… Butler South Nonm… 20354 9502 10852 1.2
## 8 1015 Alab… Calho… South Metro 116648 56274 60374 3.5
## 9 1017 Alab… Chamb… South Nonm… 34079 16258 17821 0.4
## 10 1019 Alab… Chero… South Nonm… 26008 12975 13033 1.5
## # … with 3,128 more rows, and 32 more variables: white <dbl>, black <dbl>,
## # native <dbl>, asian <dbl>, pacific <dbl>, citizens <dbl>,
## # income <dbl>, income_err <dbl>, income_per_cap <dbl>,
## # income_per_cap_err <dbl>, poverty <dbl>, child_poverty <dbl>,
## # professional <dbl>, service <dbl>, office <dbl>, construction <dbl>,
## # production <dbl>, drive <dbl>, carpool <dbl>, transit <dbl>,
## # walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## # mean_commute <dbl>, employed <dbl>, private_work <dbl>,
## # public_work <dbl>, self_employed <dbl>, family_work <dbl>,
## # unemployment <dbl>, land_area <dbl>, pop_1kk <lgl>
## # A tibble: 3,138 x 4
## state county population private_work
## <chr> <chr> <dbl> <dbl>
## 1 Indiana Kosciusko 77983 88.3
## 2 Indiana Gibson 33668 88
## 3 Indiana Bartholomew 79488 87.8
## 4 Wisconsin Washington 132921 87.7
## 5 Indiana Clinton 32835 87.5
## 6 Indiana Elkhart 200685 87.5
## 7 Wisconsin Waukesha 393873 87.4
## 8 Indiana DeKalb 42449 87.3
## 9 Michigan Kent 622590 87.3
## 10 Nebraska Dakota 20798 87.3
## # … with 3,128 more rows
# Counties with more than one million residents, most populous first.
# Only the identifying columns and the population are kept.
counties %>%
  select(state, county, population) %>%
  filter(population > 1e6) %>%
  arrange(desc(population))
## # A tibble: 41 x 3
## state county population
## <chr> <chr> <dbl>
## 1 California Los Angeles 10038388
## 2 Illinois Cook 5236393
## 3 Texas Harris 4356362
## 4 Arizona Maricopa 4018143
## 5 California San Diego 3223096
## 6 California Orange 3116069
## 7 Florida Miami-Dade 2639042
## 8 New York Kings 2595259
## 9 Texas Dallas 2485003
## 10 New York Queens 2301139
## # … with 31 more rows
# California counties with more than one million residents, sorted from
# the most to the least populous.
counties %>%
  select(state, county, population) %>%
  filter(
    state == 'California',
    population > 1e6
  ) %>%
  arrange(desc(population))
## # A tibble: 9 x 3
## state county population
## <chr> <chr> <dbl>
## 1 California Los Angeles 10038388
## 2 California San Diego 3223096
## 3 California Orange 3116069
## 4 California Riverside 2298032
## 5 California San Bernardino 2094769
## 6 California Santa Clara 1868149
## 7 California Alameda 1584983
## 8 California Sacramento 1465832
## 9 California Contra Costa 1096068
# Texas counties with more than 10,000 residents, ordered by the
# `private_work` column from highest to lowest.
counties %>%
  select(state, county, population, private_work) %>%
  filter(
    state == 'Texas',
    population > 1e4
  ) %>%
  arrange(desc(private_work))
## # A tibble: 169 x 4
## state county population private_work
## <chr> <chr> <dbl> <dbl>
## 1 Texas Gregg 123178 84.7
## 2 Texas Collin 862215 84.1
## 3 Texas Dallas 2485003 83.9
## 4 Texas Harris 4356362 83.4
## 5 Texas Andrews 16775 83.1
## 6 Texas Tarrant 1914526 83.1
## 7 Texas Titus 32553 82.5
## 8 Texas Denton 731851 82.2
## 9 Texas Ector 149557 82
## 10 Texas Moore 22281 82
## # … with 159 more rows
# Add `proportion_women` (women divided by total population) for every
# county and list the counties with the highest value first.
counties %>%
  select(state, county, population, women) %>%
  mutate(
    proportion_women = women / population
  ) %>%
  arrange(desc(proportion_women))
## # A tibble: 3,138 x 5
## state county population women proportion_women
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Virginia Norton city 4007 2381 0.594
## 2 Georgia Pulaski 11590 6724 0.580
## 3 Alabama Sumter 13341 7436 0.557
## 4 Mississippi Sharkey 4805 2666 0.555
## 5 Virginia Franklin city 8457 4691 0.555
## 6 Virginia Highland 2244 1241 0.553
## 7 Texas Edwards 1906 1052 0.552
## 8 Virginia Staunton city 24193 13332 0.551
## 9 New Mexico De Baca 2020 1113 0.551
## 10 Missouri Livingston 15042 8255 0.549
## # … with 3,128 more rows
# Add `proportion_men` (men divided by total population), keep counties
# with at least 10,000 residents, and sort by that proportion in
# descending order.
counties %>%
  select(state, county, population, men) %>%
  mutate(
    proportion_men = men / population
  ) %>%
  filter(population >= 1e4) %>%
  arrange(desc(proportion_men))
## # A tibble: 2,437 x 5
## state county population men proportion_men
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Virginia Sussex 11864 8130 0.685
## 2 California Lassen 32645 21818 0.668
## 3 Georgia Chattahoochee 11914 7940 0.666
## 4 Louisiana West Feliciana 15415 10228 0.664
## 5 Florida Union 15191 9830 0.647
## 6 Texas Jones 19978 12652 0.633
## 7 Missouri DeKalb 12782 8080 0.632
## 8 Texas Madison 13838 8648 0.625
## 9 Virginia Greensville 11760 7303 0.621
## 10 Texas Anderson 57915 35469 0.612
## # … with 2,427 more rows
## # A tibble: 4 x 2
## region n
## <chr> <int>
## 1 South 1420
## 2 North Central 1054
## 3 West 447
## 4 Northeast 217
# One-row summary over all counties: the smallest population, the largest
# unemployment value and the mean of the income column.
counties %>%
  summarize(
    min_population   = min(population),
    max_unemployment = max(unemployment),
    average_income   = mean(income)
  )
## # A tibble: 1 x 3
## min_population max_unemployment average_income
## <dbl> <dbl> <dbl>
## 1 85 29.4 46832.
density
) и высчитать ее для каждого штата. Найти штат с наибольшей плотностью.
# Population density per state: sum land area and population over the
# state's counties, divide population by area, and sort so that the
# densest state comes first.
counties %>%
  group_by(state) %>%
  summarize(
    total_area       = sum(land_area),
    total_population = sum(population)
  ) %>%
  mutate(density = total_population / total_area) %>%
  arrange(desc(density))
## # A tibble: 50 x 4
## state total_area total_population density
## <chr> <dbl> <dbl> <dbl>
## 1 New Jersey 7354. 8904413 1211.
## 2 Rhode Island 1034. 1053661 1019.
## 3 Massachusetts 7800. 6705586 860.
## 4 Connecticut 4842. 3593222 742.
## 5 Maryland 9707. 5930538 611.
## 6 Delaware 1949. 926454 475.
## 7 New York 47126. 19673174 417.
## 8 Florida 53625. 19645772 366.
## 9 Pennsylvania 44743. 12779559 286.
## 10 Ohio 40861. 11575977 283.
## # … with 40 more rows
## # A tibble: 50 x 2
## state total_pop
## <chr> <dbl>
## 1 Alabama 4830620
## 2 Alaska 725461
## 3 Arizona 6641928
## 4 Arkansas 2958208
## 5 California 38421464
## 6 Colorado 5278906
## 7 Connecticut 3593222
## 8 Delaware 926454
## 9 Florida 19645772
## 10 Georgia 10006693
## # … with 40 more rows
## # A tibble: 50 x 2
## state n_region
## <chr> <int>
## 1 Alabama 1
## 2 Alaska 1
## 3 Arizona 1
## 4 Arkansas 1
## 5 California 1
## 6 Colorado 1
## 7 Connecticut 1
## 8 Delaware 1
## 9 Florida 1
## 10 Georgia 1
## # … with 40 more rows
# Step 1: total population of every state (states are kept together with
# their region). Step 2: mean and median of those state totals per region.
counties %>%
  group_by(region, state) %>%
  summarize(total_pop = sum(population)) %>%
  group_by(region) %>%
  summarize(
    average_pop = mean(total_pop),
    median_pop  = median(total_pop)
  )
## # A tibble: 4 x 3
## region average_pop median_pop
## <chr> <dbl> <dbl>
## 1 North Central 5627687. 5580644
## 2 Northeast 6221058. 3593222
## 3 South 7370486 4804098
## 4 West 5722755. 2798636
## # A tibble: 9 x 40
## # Groups: region [4]
## census_id state county region metro population men women hispanic
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 Alas… Aleut… West Nonm… 3304 2198 1106 12
## 2 2188 Alas… North… West Nonm… 7732 4165 3567 1.8
## 3 13037 Geor… Calho… South Nonm… 6489 3892 2597 4.6
## 4 31165 Nebr… Sioux North… Nonm… 1249 651 598 3.8
## 5 36061 New … New Y… North… Metro 1629507 769434 860073 25.8
## 6 36109 New … Tompk… North… Metro 103855 51125 52730 4.6
## 7 38007 Nort… Billi… North… Nonm… 969 582 387 4.3
## 8 38051 Nort… McInt… North… Nonm… 2759 1341 1418 0.9
## 9 51678 Virg… Lexin… South Nonm… 7071 4372 2699 3.9
## # … with 31 more variables: white <dbl>, black <dbl>, native <dbl>,
## # asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>,
## # income_err <dbl>, income_per_cap <dbl>, income_per_cap_err <dbl>,
## # poverty <dbl>, child_poverty <dbl>, professional <dbl>, service <dbl>,
## # office <dbl>, construction <dbl>, production <dbl>, drive <dbl>,
## # carpool <dbl>, transit <dbl>, walk <dbl>, other_transp <dbl>,
## # work_at_home <dbl>, mean_commute <dbl>, employed <dbl>,
## # private_work <dbl>, public_work <dbl>, self_employed <dbl>,
## # family_work <dbl>, unemployment <dbl>, land_area <dbl>
## # A tibble: 4 x 3
## # Groups: region [4]
## region state average_income
## <chr> <chr> <dbl>
## 1 North Central North Dakota 55575.
## 2 Northeast New Jersey 73014.
## 3 South Maryland 69200.
## 4 West Alaska 65125.
Будем работать с файлом babynames.rds. Давайте подгрузим его.
Этот датасет показывает количество детей, которым дали определенное имя в определенный год.
## Observations: 1,756,284
## Variables: 3
## Groups: year [138]
## $ year <dbl> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 188…
## $ name <chr> "Aaron", "Ab", "Abbie", "Abbott", "Abby", "Abe", "Abel", …
## $ number <int> 102, 5, 71, 5, 6, 50, 9, 12, 27, 81, 21, 652, 24, 23, 104…
## [1] 1880 2017
# Sum the `number` column per year and draw the yearly totals as a line.
# The summary column is named SUM, which is also what the y aesthetic uses.
babynames %>%
  group_by(year) %>%
  summarize(SUM = sum(number)) %>%
  ggplot(aes(x = year, y = SUM)) +
  geom_line()
Arya
и Emilia
.
# Line chart of the yearly `number` values for the name 'Emilia',
# restricted to years after 1970. The period above closes the preceding
# sentence; it must not be attached to `babynames`, otherwise the pipeline
# would start from the undefined object `.babynames`.
# NOTE(review): the preceding text also mentions the name Arya, but only
# 'Emilia' is plotted here — confirm whether that is intended.
babynames %>%
  filter(name == 'Emilia', year > 1970) %>%
  ggplot(aes(x = year, y = number)) +
  geom_line()
# All rows for the name 'Emilia'.
babynames_Emilia <- babynames %>%
  filter(name == 'Emilia')

# First and last year present for this name; they bound the axis breaks.
MIN <- min(babynames_Emilia$year)
MAX <- max(babynames_Emilia$year)

# Line chart with an x-axis break every five years between MIN and MAX.
ggplot(babynames_Emilia, aes(x = year, y = number)) +
  geom_line() +
  scale_x_continuous(breaks = seq(from = MIN, to = MAX, by = 5)) +
  # Rotate the tick labels by 90 degrees so the dense labels stay readable.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
## # A tibble: 138 x 3
## # Groups: year [138]
## year name number
## <dbl> <chr> <int>
## 1 1880 John 9701
## 2 1881 John 8795
## 3 1882 John 9597
## 4 1883 John 8934
## 5 1884 John 9428
## 6 1885 Mary 9166
## 7 1886 Mary 9921
## 8 1887 Mary 9935
## 9 1888 Mary 11804
## 10 1889 Mary 11689
## # … with 128 more rows
Steven
, Thomas
и Matthew
.
# Rows for the three names of interest. The period above closes the
# preceding sentence; it must not be attached to the assignment target,
# otherwise the data would be stored in `.selected_names` and the plot
# below would reference an undefined `selected_names`.
selected_names <- babynames %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))

# One line per name, distinguished by colour.
ggplot(selected_names, aes(x = year, y = number, color = name)) +
  geom_line()
# For every row compute the share of that year's total `number` taken by
# the name, then keep, for each name, the row(s) where that share is
# largest. The result stays grouped by name.
babynames %>%
  group_by(year) %>%
  mutate(
    year_total = sum(number),
    fraction   = number / year_total
  ) %>%
  group_by(name) %>%
  top_n(n = 1, wt = fraction)
## # A tibble: 97,310 x 5
## # Groups: name [97,310]
## year name number year_total fraction
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Abbott 5 201484 0.0000248
## 2 1880 Abe 50 201484 0.000248
## 3 1880 Adelbert 28 201484 0.000139
## 4 1880 Adella 26 201484 0.000129
## 5 1880 Agustus 5 201484 0.0000248
## 6 1880 Albert 1493 201484 0.00741
## 7 1880 Albertus 5 201484 0.0000248
## 8 1880 Alcide 7 201484 0.0000347
## 9 1880 Alonzo 122 201484 0.000606
## 10 1880 Amos 128 201484 0.000635
## # … with 97,300 more rows
# For every row, divide `number` by the largest `number` ever recorded for
# the same name, so each name's peak year gets fraction_max == 1.
babynames %>%
  group_by(name) %>%
  mutate(
    name_max     = max(number),
    fraction_max = number / name_max
  ) %>%
  ungroup()
## # A tibble: 1,756,284 x 5
## year name number name_max fraction_max
## <dbl> <chr> <int> <int> <dbl>
## 1 1880 Aaron 102 15411 0.00662
## 2 1880 Ab 5 41 0.122
## 3 1880 Abbie 71 536 0.132
## 4 1880 Abbott 5 59 0.0847
## 5 1880 Abby 6 2048 0.00293
## 6 1880 Abe 50 280 0.179
## 7 1880 Abel 9 3245 0.00277
## 8 1880 Abigail 12 15948 0.000752
## 9 1880 Abner 27 202 0.134
## 10 1880 Abraham 81 2575 0.0315
## # … with 1,756,274 more rows