Parsing Scrabble games in gcg format

Scrabble game format: gcg

The gcg file format is a human readable representation of a Scrabble game.

In its most basic form, it has comments at the top (each preceded by the # character), followed by rows that each represent a single move, with the players alternating turns.

An example gcg file is shown below:

# Sample game held as a single multi-line string. Lines starting with `#` are
# gcg header comments (they are part of the string, not R comments); lines
# starting with `>` are moves: player, rack, board location, word played,
# move score, running total and, optionally, any extra words formed.
gcg_text <- '
#player1 Quackle Quackle Computer
#player2 David David Boys
#description Quackle Computer plays David Boys in Round 1 at the 2006 Human vs. Computer Showdown
#title 2006 Human vs. Computer Showdown Round 1
#incomplete
>Quackle: DEMJNOT  8d   JETON           +40   40
>David: ?EDYEIG   h2  rEDYEING        +64   64
>Quackle: BEDGMNP  7e   BEDIM           +26   66  BE, ET, DO
>David: HEALERS   j1  HEALERS         +75  139  BEDIMS
>Quackle: DFGINPS   k3  DIF             +29   95  AD, LI, EF
>David: COOAORS   l1  COOS            +28  167  ADO, LIS
>Quackle: EGNOPRS   m3  SPONGER         +92  187  ADOS, LISP
>David: AORWAVA  6c   AVOW            +37  204  OBE, WET
>Quackle: AEFMOVZ  8l   MEZE            +54  241
>David: AARTUNY   d8  JAUNTY          +32  236
>Quackle: ACFIOOV  1l   COOF            +27  268
>David: WALTIER  4c   WAILED          +20  256
>Quackle: AACEINV  3a   VIA             +22  290  AW
>David: IRUTRUT   a3  VIRTU            +9  265
>Quackle: AACEHLN  8a   EH              +42  332  VIRTUE
>David: QUBITUR  2b   BRUIT           +32  297  BI, RAW
>Quackle: AACILNR  9m   RAN             +16  348  ZA, EN
>David: PQUIEN? 13a   QUEY            +32  329
>Quackle: CALLIER   c13 EL               +2  350
>David: PINIR?N  1e   PIN             +11  340  PI, IT
>Quackle: ACEILOR 15a   CALORIE         +83  433  ELL
>David: TRAING? 14f   TRAdING         +67  407  TI, RE
>David:               (DATSXK)        +36  443
'

Use lex() to turn the text into tokens

  1. Start by defining the regular expression patterns for each element in the gcg file.
  2. Use flexo::lex() to turn the gcg text into tokens
# Named character vector of regular expressions, one per token type, handed to
# flexo::lex() below. Each element's name becomes the name of the tokens it
# produces (see the printed tokens further down).
# NOTE(review): pattern order presumably matters (earlier entries win when
# several could match) - confirm against the flexo::lex() docs before
# reordering.
gcg_regexes <- c(
  comment       = '(#.*?)\n',                 # Assume # only appears to denote comment to end of line
  newline       = '\n',
  whitespace    = '\\s+',
  player        = '>(.*?):',                  # start of each line with a `>`
  location      = '[a-o]\\d+|\\d+[a-o]|--|-', # Number first for horizontal words. -/-- for specials
  number        = flexo::re$number,
  # Catch-all for racks, played words and the bracketed end-of-game rack:
  # runs of word characters plus the literal characters - + . / ? ( : )
  symbol        = '[-+\\w\\./\\?\\(?:\\)]+',
  comma         = ","
)

# Tokenise the raw gcg text, then throw away the token types that carry no
# game information (layout and header comments).
tokens <- flexo::lex(gcg_text, gcg_regexes)
tokens <- tokens[
  !is.element(names(tokens), c('whitespace', 'newline', 'comment'))
]

# Preview the first 23 tokens.
tokens[seq_len(23)]
#>     player     symbol   location     symbol     number     number     player 
#>  "Quackle"  "DEMJNOT"       "8d"    "JETON"      "+40"       "40"    "David" 
#>     symbol   location     symbol     number     number     player     symbol 
#>  "?EDYEIG"       "h2" "rEDYEING"      "+64"       "64"  "Quackle"  "BEDGMNP" 
#>   location     symbol     number     number     symbol      comma     symbol 
#>       "7e"    "BEDIM"      "+26"       "66"       "BE"        ","       "ET" 
#>      comma     symbol 
#>        ","       "DO"

Use TokenStream to help turn the tokens into a coherent data.frame

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Initialise a TokenStream object so I can manipulate the stream of tokens.
# Namespace-qualified like the other flexo calls in this document, so it
# works whether or not library(flexo) has been attached.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
stream <- flexo::TokenStream$new(tokens)


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A place to store game information:
#   game      - list of 1-row data.frames, one per move
#   game_over - set to TRUE once the end-of-game rack line has been read
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
game <- list()
game_over <- FALSE

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Keep extracting moves from the tokens until the game is over or we've
# run out of tokens
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
while (!game_over && !stream$end_of_stream()) {
  # Skip whatever is left of the previous move (scores, extra words, commas)
  # so that the stream is positioned on the next 'player' token.
  stream$consume_until(name = 'player', inclusive = FALSE)

  # If skipping used up the remaining tokens there is no further move to
  # read (e.g. a game without a final rack line), so stop cleanly.
  # NOTE(review): assumes consume_until() runs to the end of the stream when
  # no 'player' token remains - confirm against the flexo documentation.
  if (stream$end_of_stream()) {
    break
  }
  stream$assert_name('player')

  # Tokens come back as named character vectors (name = token type). Drop
  # the names so they do not end up as data.frame row names.
  player <- unname(stream$consume(1))
  tiles  <- unname(stream$consume(1))

  if (startsWith(tiles, "(")) {
    # A bracketed rack, e.g. "(DATSXK)", is the end-of-game line: it has no
    # board location and no word.
    game_over <- TRUE
    loc  <- ''
    word <- ''
  } else {
    loc  <- unname(stream$consume(1))
    word <- unname(stream$consume(1))
  }

  # Create a 1-row data.frame for this move.
  df <- data.frame(
    player = player,
    tiles  = tiles,
    loc    = loc,
    word   = word,
    stringsAsFactors = FALSE
  )
  game[[length(game) + 1L]] <- df
}

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# The game as a data.frame
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
do.call(rbind, game)
#>     player    tiles loc     word
#> 1  Quackle  DEMJNOT  8d    JETON
#> 2    David  ?EDYEIG  h2 rEDYEING
#> 3  Quackle  BEDGMNP  7e    BEDIM
#> 4    David  HEALERS  j1  HEALERS
#> 5  Quackle  DFGINPS  k3      DIF
#> 6    David  COOAORS  l1     COOS
#> 7  Quackle  EGNOPRS  m3  SPONGER
#> 8    David  AORWAVA  6c     AVOW
#> 9  Quackle  AEFMOVZ  8l     MEZE
#> 10   David  AARTUNY  d8   JAUNTY
#> 11 Quackle  ACFIOOV  1l     COOF
#> 12   David  WALTIER  4c   WAILED
#> 13 Quackle  AACEINV  3a      VIA
#> 14   David  IRUTRUT  a3    VIRTU
#> 15 Quackle  AACEHLN  8a       EH
#> 16   David  QUBITUR  2b    BRUIT
#> 17 Quackle  AACILNR  9m      RAN
#> 18   David  PQUIEN? 13a     QUEY
#> 19 Quackle  CALLIER c13       EL
#> 20   David  PINIR?N  1e      PIN
#> 21 Quackle  ACEILOR 15a  CALORIE
#> 22   David  TRAING? 14f  TRAdING
#> 23   David (DATSXK)