Finding specific strings in an array using R

Question

I am trying to find if there is a quick way to search for specific strings in arrays in R, kind of like the game Boggle, except you know the word upfront.

You are allowed to move in the following directions for the next letter of the string: up, down, right or left

Say for a simple example you have an array of the form:

> G    
A, Q, A, Q, Q,  
A, Q, P, Q, Q,   
Q, Q, P, L, Q,   
Q, Q, Q, E, Q

And you want to apply a function to G with the string APPLE, for the function to return TRUE, APPLE exists in this array, and FALSE if it doesn't.

Does there exist a pre-made function or package that can do this, or alternatively is there a smart way to do it, I'm relatively new to dealing with strings in R and I'm struggling to see a way.

Any help much appreciated. Thanks.


Show source
| R   | arrays   | string   2017-01-06 15:01 3 Answers

Answers ( 3 )

  1. 2017-01-06 16:01

    this will first check if there are any characters in your word that do not exist within the array and then will check if the number of characters in the array are sufficient to meet repeat letters in your word

    word <- strsplit("APPLE", "")
    pool <- c("A", "Q", "A", "Q",
              "Q", "A", "Q", "P",
              "Q", "Q", "Q", "Q",
              "P", "L", "Q", "Q",
              "Q", "Q", "E", "Q")
    
    t.word <- table(word)
    t.pool <- table(pool)
    
    length(setdiff(names(t.word), names(t.pool))) == 0
    min(t.pool[names(t.word)] - t.word) >= 0
    

    the last two functions will both output TRUE to show that all the letters from word exist in pool and that the count of a single letter in word is not greater than that of pool

    in function form that will output TRUE if found, otherwise FALSE

    word.find <- function(word, pool) {
      t.word <- table(strsplit(word, ""))
      t.pool <- table(pool)
      length(setdiff(names(t.word), names(t.pool))) == 0 & min(t.pool[names(t.word)] - t.word) >= 0
    }
    
    word.find("APPLE", pool)
    [1] TRUE
    
    word.find("APPLES", pool)
    [1] FALSE
    
    word.find("APPLEE", pool)
    [1] FALSE
    
  2. 2017-01-06 18:01

    This function works using only base R

    THE FUNCTION

    search_string = function(matrix_array, word_to_search){
    
        position = data.frame(NA,NA,NA) #Create empty dataframe
    
        word_to_search_inv = sapply(lapply(strsplit(word_to_search, NULL), rev), paste, collapse="") #Reverse word_to_search
    
        for (i in 1:nrow(matrix_array)){
            str_row = paste((matrix_array[i,]),collapse = "") #Collapse entire row into a string
            if (grepl(word_to_search,str_row)) { #Check if the word_to_search is in the string towards right
                position = rbind(position,c(i,regexpr(word_to_search,str_row)[1],"RIGHT")) #Get position and add it to the dataframe      
            }
            if (grepl(word_to_search_inv,str_row)) {#Check if the word_to_search is in the string towards left (by checking for reverse of word_to_search)
                position = rbind(position,c(i,regexpr(word_to_search_inv,str_row)[1],"LEFT"))       
            }
        }
    
        for (j in 1:ncol(matrix_array)){        
            str_column = paste((matrix_array[,j]),collapse = "")
            if (grepl(word_to_search, str_column)) { #Check if the word_to_search is in the string towards down
                position = rbind(position, c(regexpr(word_to_search,str_column)[1],j,"DOWN"))
            }
            if (grepl(word_to_search_inv, str_column)) { #Check if the word_to_search is in the string towards up
                position = rbind(position, c(regexpr(word_to_search_inv,str_column)[1],j,"UP"))
            }
        }
    
        colnames(position) = c("ROW","COLUMN","DIRECTION")
        position = position[c(2:nrow(position)),]
        rownames(position) = NULL
        return(position) #Return the datafram containing row, columnm, and direction where word_to_match is found
    }
    

    USAGE

    #Data
    mydata = structure(c("A", "A", "Q", "Q", "D", "Q", "Q", "Q", "Q", "B", 
                         "A", "P", "P", "L", "E", "Q", "Q", "L", "E", "S", "Q", "Q", "Q", 
                         "Q", "T", "A", "P", "P", "L", "E"), .Dim = c(5L, 6L), .Dimnames = list(NULL, c("V1", "V2", 
                                                                               "V3", "V4", "V5", "V6")))
    
    key = "APPLE"
    
    #Run the function
    pos = search_string(mydata,key)
    
  3. 2017-01-07 20:01

    Adding another approach, having:

    board = structure(c("A", "A", "Q", "Q", "Q", "Q", "Q", "Q", "A", "P", 
    "P", "Q", "Q", "Q", "L", "E", "Q", "Q", "Q", "Q"), .Dim = 4:5, .Dimnames = list(
        NULL, NULL))
    
    word = "APPLE"
    

    we start with:

    matches = lapply(strsplit(word, NULL)[[1]], function(x) which(x == board, arr.ind = TRUE))
    

    which is a simple -probably unavoidable- search of indices of "board" that match each letter of the word. It's a "list" containing the row/col indices like:

    #[[1]]
    #     row col
    #[1,]   1   1
    #[2,]   2   1
    #[3,]   1   3
    #
    #[[2]]
    #     row col
    #[1,]   2   3
    #[2,]   3   3
    #
    ##.....
    

    Having that, we need to find out, progressively, whether an index in each element has a neighbour (i.e. the right/left/up/down cell) in the next element. E.g. we need something like:

    as.matrix(find_neighbours(matches[[1]], matches[[2]], dim(board)))
    #      [,1]  [,2]
    #[1,] FALSE FALSE
    #[2,] FALSE FALSE
    #[3,]  TRUE FALSE
    

    which informs us, that the row 3 of matches[[1]] is a neighbour of row 1 of matches[[2]], i.e. [1, 3] and [2, 3] are, indeed, neighbouring cells. We need this for each successive element in "matches":

    are_neighs = Map(function(x, y) which(find_neighbours(x, y, dim(board)), TRUE), 
                     matches[-length(matches)], matches[-1])
    are_neighs
    #[[1]]
    #     [,1] [,2]
    #[1,]    3    1
    #
    #[[2]]
    #     [,1] [,2]
    #[1,]    2    1
    #[2,]    1    2
    #
    #[[3]]
    #     [,1] [,2]
    #[1,]    2    1
    #
    #[[4]]
    #     [,1] [,2]
    #[1,]    1    1
    

    Now that we have the pairwise ("i" with "i + 1") neighbour matches we need to complete the chain. For this example we'd like to have a vector like c(1, 2, 1, 1) which contains the info that the row 1 of are_neighs[[1]] is chained with the row 2 of are_neighs[[2]] which is chained with row 1 of are_neighs[[3]] which is chained with row 1 of are_neighs[[4]]. This smells like an "igraph" problem, but I'm not so familiar with it (hopefully someone has a better idea), so here's a naive approach to get that chaining:

    row_connections = matrix(NA_integer_, nrow(are_neighs[[1]]), length(are_neighs))
    row_connections[, 1] = 1:nrow(are_neighs[[1]])
    cur = are_neighs[[1]][, 2]
    for(i in 1:(length(are_neighs) - 1)) {
        im = match(cur, are_neighs[[i + 1]][, 1]) 
    cur = are_neighs[[i + 1]][, 2][im]
    row_connections[, i + 1] = im
    }
    row_connections = row_connections[complete.cases(row_connections), , drop = FALSE]
    

    Which returns:

    row_connections
    #     [,1] [,2] [,3] [,4]
    #[1,]    1    2    1    1
    

    Having this vector, now, we can extract the respective chain from "are_neighs":

    Map(function(x, i) x[i, ], are_neighs, row_connections[1, ])
    #[[1]]
    #[1] 3 1
    #
    #[[2]]
    #[1] 1 2
    #
    #[[3]]
    #[1] 2 1
    #
    #[[4]]
    #[1] 1 1
    

    which can be used to extract the appropriate row/col chain of indices from "matches":

    ans = vector("list", nrow(row_connections))
    for(i in 1:nrow(row_connections)) {
         connect = Map(function(x, i) x[i, ], are_neighs, row_connections[i, ])
         ans[[i]] = do.call(rbind, Map(function(x, i) x[i, ], matches, c(connect[[1]][1], sapply(connect, "[", 2))))
    }
    ans
    #[[1]]
    #     row col
    #[1,]   1   3
    #[2,]   2   3
    #[3,]   3   3
    #[4,]   3   4
    #[5,]   4   4
    

    Wrapping it all in a function (find_neighbours is defined inside):

    library(Matrix)
    ff = function(word, board)
    {
        matches = lapply(strsplit(word, NULL)[[1]], function(x) which(x == board, arr.ind = TRUE))
    
        find_neighbours = function(x, y, d)
        {
            neighbours = function(i, j, d = d) 
            {
                ij = rbind(cbind(i, j + c(-1L, 1L)), cbind(i + c(-1L, 1L), j))
                ijr = ij[, 1]; ijc = ij[, 2]
                ij = ij[((ijr > 0L) & (ijr <= d[1])) & ((ijc > 0L) & (ijc <= d[2])), ]
    
                ij[, 1] + (ij[, 2] - 1L) * d[1]
            }
    
            x.neighs = lapply(1:nrow(x), function(i) neighbours(x[i, 1], x[i, 2], dim(board)))
            y = y[, 1] + (y[, 2] - 1L) * d[1]
    
            x.sparse = sparseMatrix(i = unlist(x.neighs), 
                                    j = rep(seq_along(x.neighs), lengths(x.neighs)), 
                                    x = 1L, dims = c(prod(d), length(x.neighs)))
            y.sparse = sparseMatrix(i = y, j = seq_along(y), x = 1L, dims = c(prod(d), length(y)))                         
    
            ans = crossprod(x.sparse, y.sparse, boolArith = TRUE)
    
            ans
        }      
    
        are_neighs = Map(function(x, y) which(find_neighbours(x, y, dim(board)), TRUE), matches[-length(matches)], matches[-1])
    
        row_connections = matrix(NA_integer_, nrow(are_neighs[[1]]), length(are_neighs))
        row_connections[, 1] = 1:nrow(are_neighs[[1]])
        cur = are_neighs[[1]][, 2]
        for(i in 1:(length(are_neighs) - 1)) {
            im = match(cur, are_neighs[[i + 1]][, 1]) 
            cur = are_neighs[[i + 1]][, 2][im]
            row_connections[, i + 1] = im
        }
        row_connections = row_connections[complete.cases(row_connections), , drop = FALSE]
    
        ans = vector("list", nrow(row_connections))
        for(i in 1:nrow(row_connections)) {
            connect = Map(function(x, i) x[i, ], are_neighs, row_connections[i, ])
            ans[[i]] = do.call(rbind, Map(function(x, i) x[i, ], matches, c(connect[[1]][1], sapply(connect, "[", 2))))
        }
        ans
    }
    

    We can try it:

    ff("APPLE", board)
    #[[1]]
    #     row col
    #[1,]   1   3
    #[2,]   2   3
    #[3,]   3   3
    #[4,]   3   4
    #[5,]   4   4
    

    And with more than one matches:

    ff("AQQP", board)
    #[[1]]
    #     row col
    #[1,]   1   1
    #[2,]   1   2
    #[3,]   2   2
    #[4,]   2   3
    #
    #[[2]]
    #     row col
    #[1,]   1   3
    #[2,]   1   2
    #[3,]   2   2
    #[4,]   2   3
    #
    #[[3]]
    #     row col
    #[1,]   1   3
    #[2,]   1   4
    #[3,]   2   4
    #[4,]   2   3
    

    Although, it's flexible in returning multiple matches, it does not return all possible matches and, in a nutshell, that's because of the use of match when building the chain of neighbours -- a linear search could be used instead, but -at the moment- adds significant code complexity.

◀ Go back