Установим и подгрузим пакеты, которые нам будут нужны для работы.

# packages <- c('readr', 'dplyr', 'ggplot2', 'tidyr', 'lubridate', 'stringr', 'naniar')
# install.packages(packages)
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(stringr)
library(naniar)

Рассмотрим набор данных weather.

# Load the raw weather table from disk and print a compact column overview.
# read_csv2() is the readr reader for ';'-separated files that use ',' as the
# decimal mark.
# NOTE(review): the measurement columns shown by glimpse() are character and
# contain values with a '.' decimal mark (e.g. "30.45"); confirm that the
# read_csv2() locale really matches the file format.
weather <- read_csv2(file = 'data/weather.csv')
glimpse(weather)
## Observations: 286
## Variables: 35
## $ X       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ year    <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 20…
## $ month   <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, …
## $ measure <chr> "Max.TemperatureF", "Mean.TemperatureF", "Min.Temperatur…
## $ X1      <chr> "64", "52", "39", "46", "40", "26", "74", "63", "52", "3…
## $ X2      <chr> "42", "38", "33", "40", "27", "17", "92", "72", "51", "3…
## $ X3      <chr> "51", "44", "37", "49", "42", "24", "100", "79", "57", "…
## $ X4      <chr> "43", "37", "30", "24", "21", "13", "69", "54", "39", "3…
## $ X5      <chr> "42", "34", "26", "37", "25", "12", "85", "66", "47", "3…
## $ X6      <chr> "45", "42", "38", "45", "40", "36", "100", "93", "85", "…
## $ X7      <chr> "38", "30", "21", "36", "20", "-3", "92", "61", "29", "3…
## $ X8      <chr> "29", "24", "18", "28", "16", "3", "92", "70", "47", "30…
## $ X9      <chr> "49", "39", "29", "49", "41", "28", "100", "93", "86", "…
## $ X10     <chr> "48", "43", "38", "45", "39", "37", "100", "95", "89", "…
## $ X11     <chr> "39", "36", "32", "37", "31", "27", "92", "87", "82", "2…
## $ X12     <chr> "39", "35", "31", "28", "27", "25", "85", "75", "64", "2…
## $ X13     <chr> "42", "37", "32", "28", "26", "24", "75", "65", "55", "2…
## $ X14     <chr> "45", "39", "33", "29", "27", "25", "82", "68", "53", "2…
## $ X15     <chr> "42", "37", "32", "33", "29", "27", "89", "75", "60", "3…
## $ X16     <chr> "44", "40", "35", "42", "36", "30", "96", "85", "73", "3…
## $ X17     <chr> "49", "45", "41", "46", "41", "32", "100", "85", "70", "…
## $ X18     <chr> "44", "40", "36", "34", "30", "26", "89", "73", "57", "2…
## $ X19     <chr> "37", "33", "29", "25", "22", "20", "69", "63", "56", "3…
## $ X20     <chr> "36", "32", "27", "30", "24", "20", "89", "79", "69", "3…
## $ X21     <chr> "36", "33", "30", "30", "27", "25", "85", "77", "69", "3…
## $ X22     <chr> "44", "39", "33", "39", "34", "25", "89", "79", "69", "3…
## $ X23     <chr> "47", "45", "42", "45", "42", "37", "100", "91", "82", "…
## $ X24     <chr> "46", "44", "41", "46", "44", "41", "100", "98", "96", "…
## $ X25     <chr> "59", "52", "44", "58", "43", "29", "100", "75", "49", "…
## $ X26     <chr> "50", "44", "37", "31", "29", "28", "70", "60", "49", "3…
## $ X27     <chr> "52", "45", "38", "34", "31", "29", "70", "60", "50", "3…
## $ X28     <chr> "52", "46", "40", "42", "35", "27", "76", "65", "53", "2…
## $ X29     <chr> "41", "36", "30", "26", "20", "10", "64", "51", "37", "3…
## $ X30     <chr> "30", "26", "22", "10", "4", "-6", "50", "38", "26", "30…
## $ X31     <chr> "30", "25", "20", "8", "5", "1", "57", "44", "31", "30.3…

Параметр na.rm = TRUE убирает строки с пропущенными значениями (NA). Такие значения появляются потому, что не во всех месяцах 31 день: например, в феврале может быть только 28 дней, и для оставшихся столбцов (X29–X31) значений нет.

# Reshape the day columns X1..X31 into long form: the column name goes to
# `day`, the cell content to `value`. na.rm = TRUE drops the rows whose value
# is NA.
weather2 <- gather(
  weather,
  key = day,
  value = value,
  paste0('X', 1:31),
  na.rm = TRUE
)
head(weather2)

Столбец X — это просто порядковый номер наблюдения. Он не несёт полезной информации, поэтому удалим его.

# X is only a running row index (1, 2, 3, ... in the glimpse output), so it
# is removed before further reshaping.
weather2 <- select(weather2, -X)
head(weather2)
# Go back to wide form along the other axis: every distinct `measure` becomes
# its own column, filled from `value`.
weather3 <- spread(weather2, key = measure, value = value)
# Strip the 'X' from the day labels ('X1' -> '1'). str_replace() only touches
# the first match, and `day` stays a character column.
weather4 <- mutate(weather3, day = str_replace(day, 'X', ''))
head(weather4)

1-й способ получить дату: объединить год, месяц и день в одну строку и преобразовать её в дату.

# Approach 1: glue year, month and day into a single "year-month-day" string
# column called `date`, then parse that string into a Date with ymd().
# Note: weather5 is assigned again by the second approach further down.
weather5 <- mutate(
  unite(weather4, date, year, month, day, sep = "-"),
  date = ymd(date)
)
head(weather5)

2-й способ получить дату: собрать её напрямую из года, месяца и дня.

# Approach 2: build `date` directly from its components with make_date().
# The year, month and day columns are kept next to the new column.
weather5 <- mutate(weather4, date = make_date(year, month, day))
head(weather5)
# Add `wd`: the day of the week of `date`, as the number returned by wday()
# with its default arguments.
weather6 <- mutate(weather5, wd = wday(date))
head(weather6)
# Show the first distinct PrecipitationIn values in descending order.
# The column is still character at this point, so the ordering is
# lexicographic; this makes non-numeric entries easy to spot before the
# column is converted to numbers.
select(weather6, PrecipitationIn) %>%
  arrange(desc(PrecipitationIn)) %>%
  unique() %>%
  head()
# Replace the 'T' marker in PrecipitationIn with '0' so that the column can be
# converted to numeric afterwards.
# NOTE(review): 'T' presumably stands for trace precipitation - confirm
# against the data source.
weather7 <- mutate(
  weather6,
  PrecipitationIn = str_replace(PrecipitationIn, 'T', '0')
)
head(weather7)
glimpse(weather7)
## Observations: 366
## Variables: 27
## $ year                      <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 20…
## $ month                     <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12…
## $ day                       <chr> "1", "10", "11", "12", "13", "14", "15…
## $ CloudCover                <chr> "6", "8", "8", "7", "5", "4", "2", "8"…
## $ Events                    <chr> "Rain", "Rain", "Rain-Snow", "Snow", N…
## $ Max.Dew.PointF            <chr> "46", "45", "37", "28", "28", "29", "3…
## $ Max.Gust.SpeedMPH         <chr> "29", "29", "28", "21", "23", "20", "2…
## $ Max.Humidity              <chr> "74", "100", "92", "85", "75", "82", "…
## $ Max.Sea.Level.PressureIn  <chr> "30.45", "29.58", "29.81", "29.88", "2…
## $ Max.TemperatureF          <chr> "64", "48", "39", "39", "42", "45", "4…
## $ Max.VisibilityMiles       <chr> "10", "10", "10", "10", "10", "10", "1…
## $ Max.Wind.SpeedMPH         <chr> "22", "23", "21", "16", "17", "15", "1…
## $ Mean.Humidity             <chr> "63", "95", "87", "75", "65", "68", "7…
## $ Mean.Sea.Level.PressureIn <chr> "30.13", "29.5", "29.61", "29.85", "29…
## $ Mean.TemperatureF         <chr> "52", "43", "36", "35", "37", "39", "3…
## $ Mean.VisibilityMiles      <chr> "10", "3", "7", "10", "10", "10", "10"…
## $ Mean.Wind.SpeedMPH        <chr> "13", "13", "13", "11", "12", "10", "6…
## $ MeanDew.PointF            <chr> "40", "39", "31", "27", "26", "27", "2…
## $ Min.DewpointF             <chr> "26", "37", "27", "25", "24", "25", "2…
## $ Min.Humidity              <chr> "52", "89", "82", "64", "55", "53", "6…
## $ Min.Sea.Level.PressureIn  <chr> "30.01", "29.43", "29.44", "29.81", "2…
## $ Min.TemperatureF          <chr> "39", "38", "32", "31", "32", "33", "3…
## $ Min.VisibilityMiles       <chr> "10", "1", "1", "7", "10", "10", "10",…
## $ PrecipitationIn           <chr> "0.01", "0.28", "0.02", "0", "0", "0.0…
## $ WindDirDegrees            <chr> "268", "357", "230", "286", "298", "30…
## $ date                      <date> 2014-12-01, 2014-12-10, 2014-12-11, 2…
## $ wd                        <dbl> 2, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 3, 7,…
# Convert every column from CloudCover through WindDirDegrees to numeric,
# except Events, which holds text labels and is left as character.
weather8 <- mutate_at(
  weather7,
  vars(CloudCover:WindDirDegrees, -Events),
  as.numeric
)
glimpse(weather8)
## Observations: 366
## Variables: 27
## $ year                      <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 20…
## $ month                     <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12…
## $ day                       <chr> "1", "10", "11", "12", "13", "14", "15…
## $ CloudCover                <dbl> 6, 8, 8, 7, 5, 4, 2, 8, 8, 7, 4, 7, 6,…
## $ Events                    <chr> "Rain", "Rain", "Rain-Snow", "Snow", N…
## $ Max.Dew.PointF            <dbl> 46, 45, 37, 28, 28, 29, 33, 42, 46, 34…
## $ Max.Gust.SpeedMPH         <dbl> 29, 29, 28, 21, 23, 20, 21, 10, 26, 30…
## $ Max.Humidity              <dbl> 74, 100, 92, 85, 75, 82, 89, 96, 100, …
## $ Max.Sea.Level.PressureIn  <dbl> 30.45, 29.58, 29.81, 29.88, 29.86, 29.…
## $ Max.TemperatureF          <dbl> 64, 48, 39, 39, 42, 45, 42, 44, 49, 44…
## $ Max.VisibilityMiles       <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
## $ Max.Wind.SpeedMPH         <dbl> 22, 23, 21, 16, 17, 15, 15, 8, 20, 23,…
## $ Mean.Humidity             <dbl> 63, 95, 87, 75, 65, 68, 75, 85, 85, 73…
## $ Mean.Sea.Level.PressureIn <dbl> 30.13, 29.50, 29.61, 29.85, 29.82, 29.…
## $ Mean.TemperatureF         <dbl> 52, 43, 36, 35, 37, 39, 37, 40, 45, 40…
## $ Mean.VisibilityMiles      <dbl> 10, 3, 7, 10, 10, 10, 10, 9, 6, 10, 10…
## $ Mean.Wind.SpeedMPH        <dbl> 13, 13, 13, 11, 12, 10, 6, 4, 11, 14, …
## $ MeanDew.PointF            <dbl> 40, 39, 31, 27, 26, 27, 29, 36, 41, 30…
## $ Min.DewpointF             <dbl> 26, 37, 27, 25, 24, 25, 27, 30, 32, 26…
## $ Min.Humidity              <dbl> 52, 89, 82, 64, 55, 53, 60, 73, 70, 57…
## $ Min.Sea.Level.PressureIn  <dbl> 30.01, 29.43, 29.44, 29.81, 29.78, 29.…
## $ Min.TemperatureF          <dbl> 39, 38, 32, 31, 32, 33, 32, 35, 41, 36…
## $ Min.VisibilityMiles       <dbl> 10, 1, 1, 7, 10, 10, 10, 5, 1, 10, 10,…
## $ PrecipitationIn           <dbl> 0.01, 0.28, 0.02, 0.00, 0.00, 0.00, 0.…
## $ WindDirDegrees            <dbl> 268, 357, 230, 286, 298, 306, 324, 79,…
## $ date                      <date> 2014-12-01, 2014-12-10, 2014-12-11, 2…
## $ wd                        <dbl> 2, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 3, 7,…
# Overview of missing data in weather8 (naniar):
# a plot of where values are missing ...
weather8 %>% vis_miss()

# ... followed by missing-value summaries per variable and per row (case).
weather8 %>% miss_var_summary()
weather8 %>% miss_case_summary()

Переменная Events имеет очень много пропущенных значений, поэтому удалим эту переменную.

# Events is dropped entirely rather than repaired.
weather9 <- select(weather8, -Events)

Переменная Max.Gust.SpeedMPH имеет небольшое количество пропущенных значений. Мы можем либо удалить строки с пропусками, либо заменить пропуски средним значением этой переменной.

# Option A: discard incomplete rows. drop_na() without column arguments
# removes every row that has an NA in any column.
weather10 <- drop_na(weather9)
# Option B: keep all rows and fill the gaps in Max.Gust.SpeedMPH with the
# column mean, then re-plot missingness to check the result.
weather11 <- impute_mean_at(weather9, "Max.Gust.SpeedMPH")
vis_miss(weather11)