## Jan 23 2026 00:02:22
## Based on observedtab.R but using aggregate/unstack and capable of using weights

Rtimestamp <- "Time-stamp: <2026-02-12 12:28:36 brendan>"

library(tidyr) ## required for complete()


## Basic table of observed (optionally weighted) data
## aggregate()/complete()/unstack() emulate table() but allow summing weights
## complete() is necessary because unstack loses track if there are empty cells
## For internal use by tabxw()
tabxwbase <- function(data, r, c, wt="") {
    ## Drop levels that do not occur (rare but screws up expected values, etc)
    data[,r] = factor(data[,r])
    data[,c] = factor(data[,c])
    r = droplevels(data[,r]); c = droplevels(data[,c])
    tx = table(r, c)
    colnames = colnames(tx); rownames = rownames(tx) # Save colnames/rownames to re-attach below
    
    dftemp = data.frame(r=r, c=c)
    dftemp$n <- if (wt == "NULL") rep(1,length(r)) else data[,wt]


    xx = complete(aggregate(x = list("N"=dftemp$n), by=(list(r = dftemp$r, c = dftemp$c)), FUN=sum),
                  r, c, fill=list(N=0))
    ## complete() is necessary if there are zero cells, otherwise unstack() misaligns rows.
    temptab = unstack(data.frame(xx), form=(N ~ c), FUN=sum)
    colnames(temptab) = colnames
    rownames(temptab) = rownames
    return(temptab)
}

## Return a table of observed, row or col pcts, expected values, residuals, pearson residuals, chisq components, or adjusted residuals
## Parameters: Dataframe, row and col, and optional weight
#' Return a table of observed values, optionally weighted
#' Parameters: Dataframe, row and col, table type, and optional weight, digitsoo
#' @export
tabxw <- function(data, rowvar, colvar=NULL, tabtype="observed", wt=NULL, digits=2, chisq=FALSE, gamma=FALSE) {
    ## Turn params to strings: https://stackoverflow.com/questions/2641653/pass-a-data-frame-column-name-to-a-function
    rowvar <- deparse(substitute(rowvar))
    colvar <- deparse(substitute(colvar))
    wt <- deparse(substitute(wt))
    if (colvar == "NULL") { ## 1D table
        colvar <- "ones"
        tabtype <- "col"
        data$ones <- factor(1, labels=("N"))
        if (!(tabtype %in% c("observed", "col"))) {
            stop(sprintf("tabtype %s not suitable for 1-way table", tabtype))
        }
    } 
    ## if (is.null(wt)) {
    ##     wt <- ""
    ## }
    ## else {
    ##     wt <- deparse(substitute(wt))
    ## }
    ## print(wt)
    obs = tabxwbase(data.frame(data), rowvar, colvar, wt=wt)
    colnames = colnames(obs)
    rownames = rownames(obs)
    nr = nrow(obs)
    nc = ncol(obs)
    ## Margins
    rowsums = rowSums(obs)
    colsums = append(colSums(obs), sum(obs))

    if (chisq) {
        chitest <- chisq.test(obs)
        message(sprintf("Chi-sq: %7.4f; df: %d; p=%8.5f", chitest$statistic, chitest$parameter, chitest$p.value))
    }
    if (gamma) {
        gammatest <- (gamma(obs, trimtotals=FALSE))
        message(sprintf("Gamma: %7.4f (ASE=%7.4f)\n", gammatest$statistic, gammatest$ase))
    }
    
    ## Respond according to tabtype parameter
    ## - observed
    if (tabtype == "observed") {
        rettab <- obs
    }
    ## - percentages, two types
    else if (tabtype %in% c("row", "col")) {
        margintype <- if (tabtype == "row") 1 else 2
        rettab <- proportions(as.matrix(obs), margin = margintype)*100
    }
    ## - residuals, multiple types
    else if (tabtype %in% c("expected", "chisqcontrib", "resid", "pearson", "adjres")) {
        ev = matrix(ncol=nc, nrow=nr, 0)
        adj = matrix(ncol=nc, nrow=nr, 0)
        rprop = rowSums(obs)/sum(obs)
        cprop = colSums(obs)/sum(obs)
        for (r in 1:nr) {
            for (c in 1:nc) {
                ev[r, c] = rowsums[r]*colsums[c]/sum(obs)
                adj[r, c] = (1 - rprop[r])*(1 - cprop[c])
            }
        }
        if (tabtype == "expected") {
            rettab <- ev
        }
        else if (tabtype == "chisqcontrib") {
            rettab <- (obs - ev)^2/ev
            dof = (nr-1)*(nc-1)
            chistat = sum(rettab)
##            message(sprintf("Chi-sq(%d): %6.4f; p: %8.6f", dof, chistat, 1 - pchisq(chistat, dof)))
        }
        else if (tabtype == "resid") {
            rettab <- obs - ev
        }
        else if (tabtype == "pearson") {
            rettab <- (obs - ev)/sqrt(ev)
        }
        else if (tabtype == "adjres") {
            rettab <- ((obs - ev)/sqrt(ev*adj))
        }
    }
    ## - error if tabtype not known
    else {
        stop(sprintf("tabtype %s not known", tabtype))
    }

    ## Attach margins to table and return
    rettab <- rbind(cbind(rettab, rowsums),colsums)
    if (colvar=="ones") {
        colnames(rettab) <- c("N", "")
        if (tabtype=="col") {
            colnames(rettab) <- c("%", "N")
            rettab[nrow(rettab),1] <- 100
        }
    }
    else {
        colnames(rettab) = append(colnames, "Total")
    }
    
    rownames(rettab) = append(rownames, "Total")

    rettab <- round(rettab, digits=digits)
#    message(rettab)
    return(rettab)
}

## Return the variable label, assuming the var.labels attribute exists
## Files read by foreign::read.dta() seem to have the attribute
getvarlab <- function(df, vname) {
    if (vname %in% names(df)) {
        return(attr(df, "var.labels")[which.max(names(df) == vname)])
    }
    return("")
}


## Goodman--Kruskal Gamma statistic for tables with row and column totals
## Set trimtotals to FALSE for tables from table()
gamma <- function(tab, trimtotals=TRUE) {
    nr <- nrow(tab)
    nc <- ncol(tab)
    if (trimtotals) {
        nr <- nr - 1
        nc <- nc - 1
        }
    conc <- 0
    disc <- 0
    N <- sum(tab[1:nr,1:nc])
    A <- matrix(nrow=nr, ncol=nc, data=0)
    D <- matrix(nrow=nr, ncol=nc, data=0)
    for (i in 1:nr) {
        for (j in 1:nc) {
            for (i2 in 1:nr) {
                for (j2 in 1:nc) {
                    if ((i2>i) & (j2>j)) A[i,j] <- A[i,j] + tab[i,j]*tab[i2,j2]
                    if ((i2<i) & (j2<j)) A[i,j] <- A[i,j] + tab[i,j]*tab[i2,j2]
                    if ((i2>i) & (j2<j)) D[i,j] <- D[i,j] + tab[i,j]*tab[i2,j2]
                    if ((i2<i) & (j2>j)) D[i,j] <- D[i,j] + tab[i,j]*tab[i2,j2]
                }
            }
        }
    }
    conc = sum(A); disc=sum(D)

    ase <- 0
    for (i in 1:nr) {
        for (j in 1:nc) {
            ase <- ase + tab[i,j]*(disc*A[i,j] - conc*D[i,j])^2
        }
    }
    ase <- ase/(conc+disc)^4
    ase <- 1/0
    statistic = (conc-disc)/(conc+disc)
    list(statistic=statistic,
         conc=conc,
         disc=disc)
}

message(sprintf("tabxw2.R: %s", substring(Rtimestamp,14,32)))