The games I played in 2018 Part One - EDA

# TODO - add lat and long details of gaming locations  
# TODO - group games by RPG, digital and board  
# TODO - add weekday names and weekend/weekday categories 
# TODO - add morning/afternoon/evening categories  

In 2018 I decided to track every game I played using the BG Stats app. The app comes with some built-in statistics, which are useful but don’t answer all my questions.

Today I’m going to do some exploratory work in R. I’m doing this pretty quickly, with lots of plotting and hacking around with different combinations of variables.

Here are the packages I’m using:

library(jsonlite) # To read in the .json format which is default export from BG Stats
library(here) # For relative file paths
library(tidyverse)  # For general wrangling and exploration  
library(padr) # To fill in or 'pad' dates in a date series  

Import

The export comes as a .json file, which is easy enough to read in.

# Parse the BG Stats JSON export into an R object.
d <- jsonlite::fromJSON(txt = "data/BGStatsExport.json")

It’s a list with six elements; four of them are interesting, and each of those can be transformed into a data frame.

# Pull four elements of the parsed export out by position and turn each one
# into its own data frame.
# NOTE(review): positions 3-6 are assumed to be stable in the BG Stats export;
# confirm against names(d).
# The resulting columns carry a prefix (e.g. games.name, plays.durationMin)
# that the code further down refers to, so keep this exact call form.
games <- data.frame(d[3])
plays <- data.frame(d[4])
locations <- data.frame(d[5])
players <- data.frame(d[6])

Exploration

So now I have a collection of data frames with info on the games, the plays, the locations and players. I’m going to poke around and see if there is anything worth following up.

# Year of publication for each game title. Rows with a bggYear of 0 are
# dropped (the subtitle describes these as games with missing info).
games %>%
  filter(games.bggYear != 0) %>%
  ggplot(aes(x = games.name, y = games.bggYear)) +
  geom_point() +
  coord_flip() +
  labs(
    title = "2018 Games played by date of publication",
    subtitle = "Excluding games with missing info"
  ) +
  theme_minimal()

How many game titles were competitive?

# Number of game titles for each value of the cooperative flag.
games %>%
  ggplot(aes(x = games.cooperative, fill = games.cooperative)) +
  geom_bar() +
  labs(
    title = "Most game titles were competitive",
    subtitle = ""
  ) +
  theme_minimal()

Looking at the ‘plays’ information with games should be interesting.

# Build plays_plus: every recorded play, annotated with the name of its game.

## Two-column lookup from game id to game name
game_id_lookup <- select(games, games.id, games.name)

## Attach the game name to each play; plays with no matching id are kept
plays_plus <- left_join(
  plays,
  game_id_lookup,
  by = c("plays.gameRefId" = "games.id")
)

Now to find the quantity of time spent on each game.

# Total time logged against each game title, in minutes and in hours.
time_per_game <- plays_plus %>%
  group_by(games.name) %>%
  summarise(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    total_mins = sum(plays.durationMin, na.rm = TRUE),
    total_hours = round(total_mins / 60, digits = 2)
  )

# Horizontal bar chart of hours per game, ordered by total hours.
ggplot(
  data = time_per_game,
  aes(x = reorder(games.name, total_hours), y = total_hours)
) +
  geom_col() + # same as geom_bar(stat = "identity")
  coord_flip() +
  labs(
    title = "I played a lot of Dungeons & Dragons",
    subtitle = ""
  ) +
  theme_minimal()

# Strip plot of the duration (in minutes) of every individual play.
plays_plus %>%
  ggplot(aes(x = "", y = plays.durationMin)) +
  geom_point(alpha = 0.2)

I pre-process the date column.

# Inspect the columns and types of plays_plus before reshaping the play date.
glimpse(plays_plus)
## Observations: 139
## Variables: 21
## $ plays.uuid             <chr> "17B82776-4DF4-473C-B715-3D36B5DB9059",...
## $ plays.ignored          <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
## $ plays.rating           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ plays.scoringSetting   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ plays.playerScores     <list> [<c("FALSE", "FALSE", "FALSE", "FALSE"...
## $ plays.modificationDate <chr> "2018-11-25 20:17:24", "2018-11-24 21:3...
## $ plays.playDate         <chr> "2018-11-25 20:16:52", "2018-11-24 21:1...
## $ plays.manualWinner     <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE...
## $ plays.locationRefId    <int> 7, 2, 2, 2, 2, 11, 11, 11, 12, 12, 12, ...
## $ plays.rounds           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ plays.usesTeams        <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, F...
## $ plays.bggId            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ plays.entryDate        <chr> "2018-11-25 20:16:52", "2018-11-24 21:1...
## $ plays.playImages       <chr> "[]", "[]", "[]", "[]", "[]", "[]", "[]...
## $ plays.durationMin      <int> 270, 13, 20, 8, 17, 49, 14, 0, 390, 86,...
## $ plays.board            <chr> "Tales From The Loop", "Cairo", "Cairo"...
## $ plays.nemestatsId      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ plays.gameRefId        <int> 38, 41, 41, 41, 41, 12, 14, 6, 10, 39, ...
## $ plays.bggLastSync      <chr> NA, NA, NA, NA, "2018-11-24 20:37:03", ...
## $ plays.comments         <chr> NA, NA, NA, NA, "", NA, NA, NA, "Hoard ...
## $ games.name             <chr> "Tales From The Loop", "Codenames Duet"...
# Split the "date time" play timestamp into separate date and time strings,
# then parse the date part into a Date.
plays_plus <- plays_plus %>%
  separate(col = plays.playDate, into = c("date", "time"), sep = " ") %>%
  mutate(date = as.Date(date, format = "%Y-%m-%d"))

Now pad in all the days that I didn’t play.

# Daily play time per game across the whole date range. pad() inserts a row
# for each day without a recorded play; with na.rm = TRUE those days sum to 0.
year_summary <- plays_plus %>%
  pad() %>%
  group_by(date, games.name) %>%
  summarise(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    total_mins = sum(plays.durationMin, na.rm = TRUE),
    total_hours = round(total_mins / 60, digits = 2)
  ) %>%
  # summarise() only peels off the last grouping level, so the result would
  # otherwise stay grouped by date and affect later mutate()/summarise() calls.
  ungroup()
## pad applied on the interval: day

# Running total of hours over the year (rows come out ordered by date).
year_summary$cum_hours <- cumsum(year_summary$total_hours)


# Daily and cumulative hours, split into Dungeons & Dragons versus everything
# else. Rows whose game name is NA get an NA category, because ifelse()
# propagates NA.
dnd_compare <- year_summary %>%
  ungroup() %>%
  mutate(
    dndYN = ifelse(games.name == "Dungeons & Dragons 5E", "D&D", "Other")
  ) %>%
  group_by(date, dndYN) %>%
  summarise(total_hours = sum(total_hours)) %>%
  ungroup() %>%
  arrange(date, dndYN) %>%
  # Accumulate within each category across the year. Previously the data was
  # still grouped by date at this point, so the running total restarted on
  # every single day.
  group_by(dndYN) %>%
  mutate(cum_hours = cumsum(total_hours)) %>%
  ungroup()



# Hours played per day, plus a per-day game score (hours played / 8).
# NOTE(review): game_score divides by 8 while avail_hours is fixed at 10;
# confirm which figure is the intended number of available hours per day.
year_game_score <- year_summary %>%
  group_by(date) %>%
  summarise(
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    total_hours = sum(total_hours, na.rm = TRUE),
    game_score = total_hours / 8
  ) %>%
  mutate(avail_hours = 10)

# One fixed-height bar per day, with the outline coloured by game score.
# NOTE(review): `col` sets the bar outline; `fill` may have been intended.
ggplot(
  data = year_game_score,
  aes(x = date, y = avail_hours, col = game_score)
) +
  geom_col() # same as geom_bar(stat = "identity")

# Cumulative hours played, drawn as a line over the year.
year_summary %>%
  ggplot(aes(x = date, y = cum_hours)) +
  geom_line()

# Hours per date/game row over time, with a smoothed trend line on top.
year_summary %>%
  ggplot(aes(x = date, y = total_hours)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Single box plot of the total_hours values in year_summary.
year_summary %>%
  ggplot(aes(x = "", y = total_hours)) +
  geom_boxplot()

# Histogram of the total_hours values in year_summary (default binning).
year_summary %>%
  ggplot(aes(x = total_hours)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.