4.8 Bases de datos con dplyr

Algunos enlaces:

4.8.1 Ejemplos (Práctica 1)

Como ejemplo emplearemos los ejercicios de la Práctica 1.

# install.packages('dbplyr')
library(dplyr)
library(dbplyr)

Conectar la base de datos:

chinook <- DBI::dbConnect(RSQLite::SQLite(), "data/chinook.db")

Listar tablas:

src_dbi(chinook)
## src:  sqlite 3.46.0 [C:\Users\guill\GitHub\tgdbook\data\chinook.db]
## tbls: albums, artists, customers, employees, genres, invoice_items, invoices,
##   media_types, playlist_track, playlists, sqlite_sequence, sqlite_stat1, tracks

Enlazar una tabla:

invoices <- tbl(chinook, "invoices")
invoices
## # Source:   table<`invoices`> [?? x 9]
## # Database: sqlite 3.46.0 [C:\Users\guill\GitHub\tgdbook\data\chinook.db]
##    InvoiceId CustomerId InvoiceDate      BillingAddress BillingCity BillingState
##        <int>      <int> <chr>            <chr>          <chr>       <chr>       
##  1         1          2 2009-01-01 00:0… Theodor-Heuss… Stuttgart   <NA>        
##  2         2          4 2009-01-02 00:0… Ullevålsveien… Oslo        <NA>        
##  3         3          8 2009-01-03 00:0… Grétrystraat … Brussels    <NA>        
##  4         4         14 2009-01-06 00:0… 8210 111 ST NW Edmonton    AB          
##  5         5         23 2009-01-11 00:0… 69 Salem Stre… Boston      MA          
##  6         6         37 2009-01-19 00:0… Berger Straße… Frankfurt   <NA>        
##  7         7         38 2009-02-01 00:0… Barbarossastr… Berlin      <NA>        
##  8         8         40 2009-02-01 00:0… 8, Rue Hanovre Paris       <NA>        
##  9         9         42 2009-02-02 00:0… 9, Place Loui… Bordeaux    <NA>        
## 10        10         46 2009-02-03 00:0… 3 Chatham Str… Dublin      Dublin      
## # ℹ more rows
## # ℹ 3 more variables: BillingCountry <chr>, BillingPostalCode <chr>,
## #   Total <dbl>

Ojo [?? x 9]: de momento no conoce el número de filas.

nrow(invoices)
## [1] NA

Mostrar la consulta SQL:

show_query(head(invoices))
## <SQL>
## SELECT `invoices`.*
## FROM `invoices`
## LIMIT 6
str(head(invoices))
## List of 2
##  $ src       :List of 2
##   ..$ con  :Formal class 'SQLiteConnection' [package "RSQLite"] with 8 slots
##   .. .. ..@ ptr                :<externalptr> 
##   .. .. ..@ dbname             : chr "C:\\Users\\guill\\GitHub\\tgdbook\\data\\chinook.db"
##   .. .. ..@ loadable.extensions: logi TRUE
##   .. .. ..@ flags              : int 70
##   .. .. ..@ vfs                : chr ""
##   .. .. ..@ ref                :<environment: 0x00000180f42bafd0> 
##   .. .. ..@ bigint             : chr "integer64"
##   .. .. ..@ extended_types     : logi FALSE
##   ..$ disco: NULL
##   ..- attr(*, "class")= chr [1:4] "src_SQLiteConnection" "src_dbi" "src_sql" "src"
##  $ lazy_query:List of 12
##   ..$ x                :List of 5
##   .. ..$ x         : 'dbplyr_table_path' chr "`invoices`"
##   .. ..$ vars      : chr [1:9] "InvoiceId" "CustomerId" "InvoiceDate" "BillingAddress" ...
##   .. ..$ group_vars: chr(0) 
##   .. ..$ order_vars: NULL
##   .. ..$ frame     : NULL
##   .. ..- attr(*, "class")= chr [1:3] "lazy_base_remote_query" "lazy_base_query" "lazy_query"
##   ..$ select           : tibble [9 × 5] (S3: tbl_df/tbl/data.frame)
##   .. ..$ name      : chr [1:9] "InvoiceId" "CustomerId" "InvoiceDate" "BillingAddress" ...
##   .. ..$ expr      :List of 9
##   .. .. ..$ : symbol InvoiceId
##   .. .. ..$ : symbol CustomerId
##   .. .. ..$ : symbol InvoiceDate
##   .. .. ..$ : symbol BillingAddress
##   .. .. ..$ : symbol BillingCity
##   .. .. ..$ : symbol BillingState
##   .. .. ..$ : symbol BillingCountry
##   .. .. ..$ : symbol BillingPostalCode
##   .. .. ..$ : symbol Total
##   .. ..$ group_vars:List of 9
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. .. ..$ : chr(0) 
##   .. ..$ order_vars:List of 9
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. ..$ frame     :List of 9
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   .. .. ..$ : NULL
##   ..$ where            : NULL
##   ..$ group_by         : NULL
##   ..$ order_by         : NULL
##   ..$ distinct         : logi FALSE
##   ..$ limit            : num 6
##   ..$ select_operation : chr "select"
##   ..$ message_summarise: NULL
##   ..$ group_vars       : chr(0) 
##   ..$ order_vars       : NULL
##   ..$ frame            : NULL
##   ..- attr(*, "class")= chr [1:2] "lazy_select_query" "lazy_query"
##  - attr(*, "class")= chr [1:5] "tbl_SQLiteConnection" "tbl_dbi" "tbl_sql" "tbl_lazy" ...

Al trabajar con bases de datos, dplyr intenta ser lo más vago posible:

  • No exporta datos a R a menos que se pida explícitamente (colect()).

  • Retrasa cualquier operación lo máximo posible: agrupa todo lo que se desea hacer y luego hace una única petición a la base de datos.

invoices %>% head %>% collect
## # A tibble: 6 × 9
##   InvoiceId CustomerId InvoiceDate       BillingAddress BillingCity BillingState
##       <int>      <int> <chr>             <chr>          <chr>       <chr>       
## 1         1          2 2009-01-01 00:00… Theodor-Heuss… Stuttgart   <NA>        
## 2         2          4 2009-01-02 00:00… Ullevålsveien… Oslo        <NA>        
## 3         3          8 2009-01-03 00:00… Grétrystraat … Brussels    <NA>        
## 4         4         14 2009-01-06 00:00… 8210 111 ST NW Edmonton    AB          
## 5         5         23 2009-01-11 00:00… 69 Salem Stre… Boston      MA          
## 6         6         37 2009-01-19 00:00… Berger Straße… Frankfurt   <NA>        
## # ℹ 3 more variables: BillingCountry <chr>, BillingPostalCode <chr>,
## #   Total <dbl>
invoices %>% count # número de filas
## # Source:   SQL [1 x 1]
## # Database: sqlite 3.46.0 [C:\Users\guill\GitHub\tgdbook\data\chinook.db]
##       n
##   <int>
## 1   412
  1. Conocer el importe mínimo, máximo y la media de las facturas

    res <- invoices %>% summarise(min = min(Total, na.rm = TRUE), 
                            max = max(Total, na.rm = TRUE), med = mean(Total, na.rm = TRUE))
    show_query(res)
    ## <SQL>
    ## SELECT MIN(`Total`) AS `min`, MAX(`Total`) AS `max`, AVG(`Total`) AS `med`
    ## FROM `invoices`
    res  %>% collect
    ## # A tibble: 1 × 3
    ##     min   max   med
    ##   <dbl> <dbl> <dbl>
    ## 1  0.99  25.9  5.65
  2. Conocer el total de las facturas de cada uno de los países.

    res <- invoices %>% group_by(BillingCountry) %>% 
              summarise(n = n(), total = sum(Total, na.rm = TRUE))
    show_query(res)
    ## <SQL>
    ## SELECT `BillingCountry`, COUNT(*) AS `n`, SUM(`Total`) AS `total`
    ## FROM `invoices`
    ## GROUP BY `BillingCountry`
    res  %>% collect
    ## # A tibble: 24 × 3
    ##    BillingCountry     n total
    ##    <chr>          <int> <dbl>
    ##  1 Argentina          7  37.6
    ##  2 Australia          7  37.6
    ##  3 Austria            7  42.6
    ##  4 Belgium            7  37.6
    ##  5 Brazil            35 190. 
    ##  6 Canada            56 304. 
    ##  7 Chile              7  46.6
    ##  8 Czech Republic    14  90.2
    ##  9 Denmark            7  37.6
    ## 10 Finland            7  41.6
    ## # ℹ 14 more rows
  3. Obtener el listado de países junto con su facturación media, ordenado

    1. alfabéticamente por país
    res <- invoices %>% group_by(BillingCountry) %>% 
              summarise(n = n(), med = mean(Total, na.rm = TRUE)) %>%
              arrange(BillingCountry)
    show_query(res)
    ## <SQL>
    ## SELECT `BillingCountry`, COUNT(*) AS `n`, AVG(`Total`) AS `med`
    ## FROM `invoices`
    ## GROUP BY `BillingCountry`
    ## ORDER BY `BillingCountry`
    res  %>% collect
    ## # A tibble: 24 × 3
    ##    BillingCountry     n   med
    ##    <chr>          <int> <dbl>
    ##  1 Argentina          7  5.37
    ##  2 Australia          7  5.37
    ##  3 Austria            7  6.09
    ##  4 Belgium            7  5.37
    ##  5 Brazil            35  5.43
    ##  6 Canada            56  5.43
    ##  7 Chile              7  6.66
    ##  8 Czech Republic    14  6.45
    ##  9 Denmark            7  5.37
    ## 10 Finland            7  5.95
    ## # ℹ 14 more rows
  1. decrecientemente por importe de facturación media

    invoices %>% group_by(BillingCountry) %>% 
              summarise(n = n(), med = mean(Total, na.rm = TRUE)) %>%
              arrange(desc(med)) %>% collect
    ## # A tibble: 24 × 3
    ##    BillingCountry     n   med
    ##    <chr>          <int> <dbl>
    ##  1 Chile              7  6.66
    ##  2 Ireland            7  6.52
    ##  3 Hungary            7  6.52
    ##  4 Czech Republic    14  6.45
    ##  5 Austria            7  6.09
    ##  6 Finland            7  5.95
    ##  7 Netherlands        7  5.80
    ##  8 India             13  5.79
    ##  9 USA               91  5.75
    ## 10 Norway             7  5.66
    ## # ℹ 14 more rows
  1. Obtener un listado con Nombre y Apellidos de cliente y el importe de cada una de sus facturas (Hint: WHERE customer.CustomerID=invoices.CustomerID)

    customers <- tbl(chinook, "customers")
    tbl_vars(customers) 
    ## <dplyr:::vars>
    ##  [1] "CustomerId"   "FirstName"    "LastName"     "Company"      "Address"     
    ##  [6] "City"         "State"        "Country"      "PostalCode"   "Phone"       
    ## [11] "Fax"          "Email"        "SupportRepId"
    res <- customers %>% inner_join(invoices, by = "CustomerId") %>% select(FirstName, LastName, Country, Total) 
    show_query(res)
    ## <SQL>
    ## SELECT `FirstName`, `LastName`, `Country`, `Total`
    ## FROM `customers`
    ## INNER JOIN `invoices`
    ##   ON (`customers`.`CustomerId` = `invoices`.`CustomerId`)
    res  %>% collect
    ## # A tibble: 412 × 4
    ##    FirstName LastName  Country Total
    ##    <chr>     <chr>     <chr>   <dbl>
    ##  1 Luís      Gonçalves Brazil   3.98
    ##  2 Luís      Gonçalves Brazil   3.96
    ##  3 Luís      Gonçalves Brazil   5.94
    ##  4 Luís      Gonçalves Brazil   0.99
    ##  5 Luís      Gonçalves Brazil   1.98
    ##  6 Luís      Gonçalves Brazil  13.9 
    ##  7 Luís      Gonçalves Brazil   8.91
    ##  8 Leonie    Köhler    Germany  1.98
    ##  9 Leonie    Köhler    Germany 13.9 
    ## 10 Leonie    Köhler    Germany  8.91
    ## # ℹ 402 more rows
  2. ¿Qué porcentaje de las canciones son video?

    tracks <- tbl(chinook, "tracks")
    head(tracks) 
    ## # Source:   SQL [6 x 9]
    ## # Database: sqlite 3.46.0 [C:\Users\guill\GitHub\tgdbook\data\chinook.db]
    ##   TrackId Name          AlbumId MediaTypeId GenreId Composer Milliseconds  Bytes
    ##     <int> <chr>           <int>       <int>   <int> <chr>           <int>  <int>
    ## 1       1 For Those Ab…       1           1       1 Angus Y…       343719 1.12e7
    ## 2       2 Balls to the…       2           2       1 <NA>           342562 5.51e6
    ## 3       3 Fast As a Sh…       3           2       1 F. Balt…       230619 3.99e6
    ## 4       4 Restless and…       3           2       1 F. Balt…       252051 4.33e6
    ## 5       5 Princess of …       3           2       1 Deaffy …       375418 6.29e6
    ## 6       6 Put The Fing…       1           1       1 Angus Y…       205662 6.71e6
    ## # ℹ 1 more variable: UnitPrice <dbl>
    tracks %>% group_by(MediaTypeId) %>% 
        summarise(n = n()) %>% collect %>% mutate(freq = n / sum(n))
    ## # A tibble: 5 × 3
    ##   MediaTypeId     n    freq
    ##         <int> <int>   <dbl>
    ## 1           1  3034 0.866  
    ## 2           2   237 0.0677 
    ## 3           3   214 0.0611 
    ## 4           4     7 0.00200
    ## 5           5    11 0.00314
    media_types <- tbl(chinook, "media_types")
    head(media_types)
    ## # Source:   SQL [5 x 2]
    ## # Database: sqlite 3.46.0 [C:\Users\guill\GitHub\tgdbook\data\chinook.db]
    ##   MediaTypeId Name                       
    ##         <int> <chr>                      
    ## 1           1 MPEG audio file            
    ## 2           2 Protected AAC audio file   
    ## 3           3 Protected MPEG-4 video file
    ## 4           4 Purchased AAC audio file   
    ## 5           5 AAC audio file
    tracks %>% inner_join(media_types, by = "MediaTypeId") %>% count(Name.y) %>% 
        collect %>% mutate(freq = n / sum(n)) %>% filter(grepl('video', Name.y))
    ## # A tibble: 1 × 3
    ##   Name.y                          n   freq
    ##   <chr>                       <int>  <dbl>
    ## 1 Protected MPEG-4 video file   214 0.0611
  3. Listar los 10 mejores clientes (aquellos a los que se les ha facturado más cantidad) indicando Nombre, Apellidos, Pais y el importe total de su facturación.

    customers %>% inner_join(invoices, by = "CustomerId") %>% group_by(CustomerId) %>% 
        summarise(FirstName, LastName, Country, total = sum(Total, na.rm = TRUE)) %>%  
        arrange(desc(total)) %>% head(10) %>% collect
    ## # A tibble: 10 × 5
    ##    CustomerId FirstName LastName   Country        total
    ##         <int> <chr>     <chr>      <chr>          <dbl>
    ##  1          6 Helena    Holý       Czech Republic  49.6
    ##  2         26 Richard   Cunningham USA             47.6
    ##  3         57 Luis      Rojas      Chile           46.6
    ##  4         45 Ladislav  Kovács     Hungary         45.6
    ##  5         46 Hugh      O'Reilly   Ireland         45.6
    ##  6         24 Frank     Ralston    USA             43.6
    ##  7         28 Julia     Barnett    USA             43.6
    ##  8         37 Fynn      Zimmermann Germany         43.6
    ##  9          7 Astrid    Gruber     Austria         42.6
    ## 10         25 Victor    Stevens    USA             42.6
  4. Listar los géneros musicales por orden decreciente de popularidad (definida la popularidad como el número de canciones de ese género), indicando el porcentaje de las canciones de ese género.

    tracks %>% inner_join(tbl(chinook, "genres"), by = "GenreId") %>% count(Name.y) %>% 
        arrange(desc(n)) %>% collect %>% mutate(freq = n / sum(n))
    ## # A tibble: 25 × 3
    ##    Name.y                 n   freq
    ##    <chr>              <int>  <dbl>
    ##  1 Rock                1297 0.370 
    ##  2 Latin                579 0.165 
    ##  3 Metal                374 0.107 
    ##  4 Alternative & Punk   332 0.0948
    ##  5 Jazz                 130 0.0371
    ##  6 TV Shows              93 0.0265
    ##  7 Blues                 81 0.0231
    ##  8 Classical             74 0.0211
    ##  9 Drama                 64 0.0183
    ## 10 R&B/Soul              61 0.0174
    ## # ℹ 15 more rows
  5. Listar los 10 artistas con mayor número de canciones de forma descendente según el número de canciones.

    tracks %>% inner_join(tbl(chinook, "albums"), by = "AlbumId") %>% 
        inner_join(tbl(chinook, "artists"), by = "ArtistId") %>% 
        count(Name.y) %>% arrange(desc(n)) %>% collect
    ## # A tibble: 204 × 2
    ##    Name.y              n
    ##    <chr>           <int>
    ##  1 Iron Maiden       213
    ##  2 U2                135
    ##  3 Led Zeppelin      114
    ##  4 Metallica         112
    ##  5 Lost               92
    ##  6 Deep Purple        92
    ##  7 Pearl Jam          67
    ##  8 Lenny Kravitz      57
    ##  9 Various Artists    56
    ## 10 The Office         53
    ## # ℹ 194 more rows

Desconectar la base de datos:

DBI::dbDisconnect(chinook)