# codeBbigeye2.r - calculations from manuscript regarding bigeye
# tuna data (Issues 1-3). See codeAbigeye1.r for standard
# likelihood and Akaike weight calculations. In the
# comments and printed out to the console are direct chunks
# of text from our manuscript, followed by the printout of the
# corresponding calculation from the code. So to check a
# calculation reported in the code, find the corresponding
# text in this file. To do analyses on other data sets, adapt
# this code. Includes R code for producing final .tiff
# file for PLoS ONE for Figure 2.
#
# Provided in the spirit of Barnes, N. (2010), Publish your computer
# code - it is good enough, Nature 467:753.
#
# Andrew Edwards, Andrew.Edwards@dfo-mpo.gc.ca .
# 25th June 2012.
# To see the code echoed in your console, run
# source("codeBbigeye2.r", echo=TRUE)
# R code extracted from my original Sweave file bigeyemle.Snw
# that was started on 13th October 2009.
# Clear the workspace so the calculations always start from a clean
# state (intentional here: this is a stand-alone reproducibility script).
rm(list=ls())
# require(xtable)
require(bbmle) # For BICtab (BIC comparison table with weights),
               # used towards the end of this script
### R function definitions ###
# For printing output to screen (I normally use Sweave and it
# would go straight into a LaTeX file, but am sticking with R
# for simplicity to make this code available):
# Print a variable as "<name> = <value>" followed by a newline, where
# <name> is taken from the unevaluated argument via substitute().
catPaste = function(x)
{
  varName = as.character(substitute(x))
  cat(paste(varName, "=", x), "\n")
}
### End R function definitions ###
# Lower limit, a, of the range of data over which to test the models.
# Either predefine a here, or use NA to have it set (below) to the
# minimum of the data, i.e. to test the full data set. For the bigeye
# data the minimum value is used, as that is what was originally done.
a = NA
# bigeyeFigS2h.csv is the original data file from David Sims of
# 29,900 steps. It contains two columns: logstep and logrank.
# logstep is the log10 of the step size,
# logrank is the log10 of the rank of each step size.
# The steps were already sorted into descending order,
# with the largest step having a rank of 1 (logrank of 0)
# and the smallest step having a rank of 29,900
# (logrank of 4.475671).
# pseudoBigeye.txt is the pseudo data file (obtained by
# sampling from the original step sizes with replacement to obtain a
# pseudo data set of the same size with similar properties, that
# can be made publicly available).
original.data = 0 # switch: 1 for original, 0 for pseudo
if(original.data)
{ bigeye = read.table(file="bigeyeFigS2h.csv", header=TRUE,
sep=",") } else
{ bigeye = read.table(file="pseudoBigeye.txt", header=TRUE,
sep=",")}
if(original.data)
{ print("Using original data") } else
{ print("**Note: using pseudo data so numbers will not exactly match published results**")}
# dim(bigeye) # should be 29900 2
log10step = bigeye$logstep # step sizes between depths on log10
# scale
rawvalsall = 10^log10step # Raw unlogged data are step sizes
rawvalsall = rawvalsall[!is.na(rawvalsall)]
# remove NA's (values < a are still kept at this point; the cut-off
# at a is applied below)
rawvalsall = rawvalsall[rawvalsall > 0] # and remove zeros
# Doesn't affect bigeye values
#if(bround) { b = ceiling(max(rawvalsall)) } else
# { b = max(rawvalsall) }
if( is.na(a) ) a = min(rawvalsall)
# else it's the value prescribed above
x = rawvalsall[rawvalsall >= a] # restrict to the tested range [a, inf)
n = length(x) # sample size
if(n != 29900) stop("***Error here***")
# check sample size is correct
log10rank = bigeye$logrank # ranks on log10 scale
rank = 10^log10rank # ranks (data set is already sorted)
print("**Issue one**")
# Issue one: likelihoods were computed from linear fits of models, rather than from the underlying probability distributions being tested.
# For the rank/frequency method (Table 1, method {\bf e}) movement steps, $x$, were put in descending order such that their respective ranks were given by $y = 1,2,3,...,n$; $y(x)$ thus represents the number of steps $\geq x$. The exponential model was tested by fitting a straight line to log$_{10} y$ against $x$ (page 4 of Supplementary Information of \cite{sims08}). Thus,
# \begin{eqnarray}
# \log_{10} y = \alpha x + \beta
# \label{alphabeta}
# \end{eqnarray}
# where $\alpha$ and $\beta$ are the fitted coefficients. For the tuna data (sample size $n = 29,900$), we obtain $\alpha = -0.0169$ and $\beta = 4.40$ using linear regression, and compute a log-likelihood of 29,016.7 using the {\tt logLik()} function in R \cite{R09}. This reproduces the log-likelihood value in Supplementary Table 7 of \cite{sims08}. Whether this is the exact approach used in \cite{sims08} could not be confirmed by the authors when queried, but our calculation exactly agrees with the reported value.
# Data are:
# x - step sizes
# y - ranks of the data
log10x = log10(x) # step sizes on log10 scale
y = rank # ranks: number of steps >= each step size
log10y = log10rank # ranks on log10 scale
# linear regression for the exponential model:
lm.exp = lm(log10y ~ x)
alpha = lm.exp$coeff[2] # fitted slope
beta = lm.exp$coeff[1] # fitted intercept
print("Calculations for Issue one:")
catPaste(alpha)
catPaste(beta)
catPaste(n)
print("Reproducing the log-likelihood value for exponential model in original Supplementary Table 7:")
print(logLik(lm.exp))
# print(summary(lm.exp))
# Note that the Adjusted R-squared is exactly the percent
# deviance explained in Supp Table 7.
print("We also exactly reproduce the other two log-likelihood values reported for bigeye tuna in Supplementary Table 7 of [27]:")
# linear regression for the power-law model (called LL-lin in
# original Supplementary Table 7):
lm.pow = lm(log10y ~ log10x)
print(logLik(lm.pow))
# print(summary(lm.pow))
# Again, Adjusted R-squared is the percent deviance explained.
# multiple linear regression fit for the quadratic model
# LL-quad (see later in manuscript)
lm.quad = lm(log10y ~ log10x + I(log10x^2))
# print(summary(lm.quad))
print(logLik(lm.quad))
# And these match Supp Table 7.
print("And to get the BIC weights in original Supp Table 6, as given in our Table 1e:")
# BIC = -2 log(likelihood) + k log(n), where k is the number of fitted
# regression coefficients: 2 for the power-law and exponential models
# (slope and intercept), 3 for the quadratic model.
BICpower = - 2 * logLik(lm.pow) + 2 * log(n)
BICexp = - 2 * logLik(lm.exp) + 2 * log(n)
BICquad = - 2 * logLik(lm.quad) + 3 * log(n)
# NOTE(fix): the BICquad line previously read
#   BICquad = - 2 * logLik(lm.quad) * log(n)
# i.e. the whole deviance was multiplied by log(n) instead of adding
# the penalty term "+ k log(n)". The BIC weights printed below are
# unchanged to the precision shown, because the log-likelihood
# differences between the models are of order thousands, so the
# quadratic model dominates either way.
BICall = c(BICpower, BICexp, BICquad)
#giving BIC differences $\Delta_i = BIC_i - BIC_{min}$:
BICmin = min(BICall)
DeltaBIC = BICall - BICmin
#and 'relative likelihoods' e$^{-\Delta_i / 2}$:
likeBIC = exp( - 0.5 * DeltaBIC)
#giving BIC weights:
BICweights = likeBIC/sum(likeBIC)
print("BIC weights for all three models (power, exp, quad)")
print(BICweights)
# which agree with the reported values.
# Without the quadratic model:
BICpowexp = c(BICpower, BICexp)
# giving BIC differences $\Delta_i = BIC_i - BIC_{min}$
BIC2min = min(BICpowexp)
DeltaBIC2 = BICpowexp - BIC2min
# and 'relative likelihoods' e$^{-\Delta_i / 2}$
likeBIC2 = exp( - 0.5 * DeltaBIC2)
# giving BIC weights
BIC2weights = likeBIC2/sum(likeBIC2)
print("BIC weights without quadratic model (power, exp)")
print(BIC2weights)
# as intuitively is obvious from the likelihood values.
# Or can just do the following, which I found later from
# Ben Bolker's bbmle package:
# print(BICtab(lm.pow, lm.exp, lm.quad, nobs = length(x), weights=TRUE, sort=FALSE))
# However, this log-likelihood calculation is based on the standard assumption of Gaussian errors when fitting a straight line. Since $y(x)$ are ranks $1, 2, 3, ..., n$, the interpretation of such errors is problematic. More importantly, the resulting log-likelihood corresponds to the likelihood of the observed residuals around the fitted straight line assuming a Gaussian residual model, rather than the likelihood of the observed data coming from the exponential probability distribution (which is the hypothesis being tested). The resulting log-likelihood depends on the sum of squared residuals around the fitted line, given on page 12 of \cite{ba02} as
# \begin{eqnarray}
#\log( \mbox{likelihood}) = - \frac{n}{2} \log( \hat{\sigma}^2 ) - \frac{n}{2}\log(2 \pi) - \frac{n}{2}
#\label{LL}
#\end{eqnarray}
#where $\hat{\sigma}^2$ is the maximum likelihood estimate of the variance of the assumed Gaussian errors and is given by $\hat{\sigma}^2 = \mbox{RSS} / n$, where RSS is the residual sum of squares of the errors (see also page 172 of \cite{hm97}).
# Calculations from the data give $\hat{\sigma}^2 = 0.00840615$, yielding a log-likelihood from (\ref{LL}) of 29,016.7, matching the value given by \cite{sims08} and the aforementioned value calculated using the R function {\tt logLik()}.
# Residual sum of squares around the fitted straight line (exponential
# model, coefficients alpha and beta from above):
RSS = sum( (log10y - alpha * x - beta)^2 )
sigma2mle = RSS / n # MLE of the Gaussian error variance
print("For equation (6), sigma-hat squared is:")
print(sigma2mle)
# The 'log-likelihood' from the equation is then
logLikResidual = - n/2 * ( log( sigma2mle) + log(2*pi) + 1)
print("The log(likelihood) calculated from our equation (6) is:")
print(logLikResidual)
# Inspection of the source code of {\tt logLik()}, by typing {\tt stats:::logLik.lm} in R, confirms that it does use (\ref{LL}) to give a log-likelihood value. This is the correct approach if testing a \emph{functional relationship}, whereby $y(x)$ is a function of $x$ (and Gaussian errors are assumed). But the situation here requires the testing of a \emph{probability distribution}, whereby $f(x)$ is the probability density function of $x$.
# To see this type
# stats:::logLik.lm
# The calculation is the line:
# val <- 0.5 * (sum(log(w)) - N * (log(2 * pi) + 1 - log(N) +
# log(sum(w * res^2))))
# w are the weights which are all 1, reducing the line to
# val <- 0.5 * ( - N * (log(2 * pi) + 1 - log(N) +
# log(sum(res^2))))
# giving our equation (6) with a bit of rearrangement.
# Thus, 29,016.7 is not the log-likelihood of the exponential \emph{distribution}, which we calculate to be $-116,297$.
# (That standard likelihood calculation is in codeAbigeye1.r.)
# End of Issue One.
print("**Issue Two**")
# Issue two: the tested models are not normalised probability distributions.
# Equation (\ref{expfx}) requires $C = 1$ to be a correctly normalised exponential distribution; otherwise $\int_a^\infty f(x) \mbox{d}x~\neq~1$. However, the regression calculation gives $\alpha=-0.0169$ and $\beta = 4.40$, leading to $\lambda = 0.039$ and $C = 0.74 \neq 1$. There is no constraint on the regression coefficients $\alpha$ and $\beta$ to correctly normalise the probability density function such that it integrates to one.
catPaste(alpha)
catPaste(beta)
lambdaMerv = - alpha * log(10) # lambda formulation derived
catPaste(lambdaMerv) # by Mervyn
Cmerv = 10^beta * exp(- lambdaMerv * a) / n # normalisation constant C
catPaste(Cmerv)
# So the reported log-likelihood from the incorrect regression method (reproduced above) relates to a function that is not an exponential distribution. The estimated value of $\lambda = 0.039$ differs from the correct maximum likelihood estimate (e.g.~\cite{edwa07}), which is simply $\hat{\lambda} = 1 / (\Sigma x / n - a) = 0.056$.
# Analytical MLE of lambda for the exponential distribution (this is
# the same calculation as done in codeAbigeye1.r; recomputed here):
lambdamleinfanal = 1 / ( sum(x)/n - a)
catPaste(lambdamleinfanal)
# End of issue two.
print("**Issue three**")
# Issue three: the quadratic model obscured support for the exponential model over the power-law model.
# However, this model also corresponds to an invalid probability density function. Similar calculations to those described for (\ref{expfx}) give the resulting probability density function
# \begin{eqnarray}
# f(x) = - \frac{10^\kappa}{n} x^{\theta \log_{10} x + \gamma - 1} \left( 2 \theta \log_{10} x + \gamma \right),~~~x \geq a,
# \label{gx}
# \end{eqnarray}
# obtained by writing $\log_{10} y = \gamma \log_{10} x + \theta (\log_{10} x)^2 + \kappa$ to fit the quadratic model on $\log_{10}$ axes, where $\gamma, \theta$ and $\kappa$ are the regression coefficients; see the \emph{Methods} for the full derivation. Multiple linear regression \cite{bolk08} gives $\gamma = 0.818, \theta = -0.752$ and $\kappa = 4.19$, (and the residual-based log-likelihood of 40,526.4, reproducing that in Supplementary Table 7 of \cite{sims08}). These coefficients give $y(x=a) = 25,603 \neq n$, showing that (\ref{gx}) is not a normalised probability density function.
# Extract the three quadratic-regression coefficients. as.numeric()
# strips the coefficient names so all three print consistently as
# plain numbers (originally only gamma was stripped); the numerical
# values used later are unchanged.
gamma = as.numeric(lm.quad$coeff[2])
theta = as.numeric(lm.quad$coeff[3])
kappa = as.numeric(lm.quad$coeff[1])
catPaste(gamma)
catPaste(theta)
catPaste(kappa)
# print(summary(lm.quad))
logLikResQuad = logLik(lm.quad) # residual-based log-likelihood
catPaste(logLikResQuad)
# To calculate y(x=a) use equation (39)
yaQuad = 10^kappa * a^gamma * a^(theta * log10(a))
# Use cat() so this prints however the script is run; the original
# bare paste() was only displayed under source(..., echo=TRUE).
cat("y at x=a from equation (39) is ", yaQuad, "\n")
# Figure 2
# The curves given by (\ref{expfx}) and (\ref{gx}) (and (\ref{hx}) which is defined below) are plotted in Figure \ref{fig:quadpdf}, to see how the models compare when plotted as probability density functions. Given the quadratic model's highly nonlinear formulation (\ref{gx}), we anticipated that it might have a hump shape, unlike the decreasing power-law and exponential functions. However, Figure \ref{fig:quadpdf} shows that it actually takes negative values.
# bigeye quadratic plot:
xvec = seq(a,400, 0.1) # x values at which to evaluate the pdfs
fquad = -10^kappa / n * xvec^(theta * log10(xvec) + gamma -1) *
(2 * theta * log10(xvec) + gamma) # equation (8)
fexp = Cmerv * lambdaMerv * exp(-lambdaMerv * ( xvec - a ) )
# equation (7)
mu.pow = 1-lm.pow$coeff[2] # from rank/freq lin regression
kappa.pow = lm.pow$coeff[1]
fpow = 10^kappa.pow / n *(mu.pow - 1) * xvec ^(- mu.pow)
# equation (9)
# Figure 2
if(original.data) {fig.name = "figure2orig"} else
{fig.name= "figure2pseudo"}
# postscript, for incorporation into LaTeX for submission
postscript(paste(fig.name, ".eps", sep=""), horizontal=FALSE, paper="special", height = 8, width = 4+0.4)
# + 0.4 otherwise it cuts off 500 when printing
par(mfrow=c(2,1)) # two panels, one above the other
oldmai = par("mai") # 1.02 0.82 0.82 0.42 for two figs
# think may be indpt of fig size
par(mai = c(0.65 - 0.1, 0.82, 0.1 + 0.5, 0.42+0.4))
# so keep left and right the same, then 0.1 is taken off bottom here (as no x label) and taken off top of second figure. Adding 0.5 to top of this and bottom of second one and 1 inch to total height so it prints out okay. + 0.4 for final one is so it doesn't cut off 100 when printing.
# Panel A: the three fitted models plotted as probability densities
plot(xvec, fpow, type="l", xlim=c(0, 120), ylim=c(-.03,0.06), col="blue", ylab="Probability density", xlab="", xaxs="i", yaxs="i", mgp=c(2.0,0.5,0))
abline(h=0, col="grey", lwd=2) # zero line shows fquad going negative
lines(xvec, fexp, col="red")
lines(xvec, fquad, col="black", lwd=1.5)
figlabpos = 0.95 # proportion of x and y axis lengths to put (a)
# etc. in the corner of
text(figlabpos*120, -0.03 + figlabpos*0.09, "A", font=2)
# Panel B: blowing up quadratic rank/frequency to show the hump
xvals = seq(3,4.5,0.01)
x.lim = c(min(xvals)-0.5, max(xvals))
# For quadratic model
quadbigeye = 10^kappa * xvals^gamma * xvals^(theta * log10(xvals))
par(mai = c(0.65 + 0.5, 0.82, 0.1 - 0.1, 0.42+0.4))
# shifting up for this figure
plot(xvals, quadbigeye, ylab=expression( paste("Number of ", moves >= x)), xlab="Movement length, x (m)", xlim=x.lim, ylim=c(25500, 25850), xaxs="i", yaxs="i", mgp=c(2.0,0.5,0), col="black", lwd=1.5, type="l") # axes=FALSE,
points(xvals[1], quadbigeye[1]) # point at x=3.0
points(xvals[xvals==4], quadbigeye[xvals==4]) # point at x=4.0
text(min(x.lim) + figlabpos*diff(x.lim) ,figlabpos*350 + 25500, "B", font=2)
dev.off() # close the postscript device
# Figure 2 again.
# R code for tiff file for PLoS ONE publication (resized to
# PLoS ONE guidelines, and compressed tiff).
tiff(paste(fig.name, ".tiff", sep=""), height = 6.1, width = 3.25,
units="in", res=400, compression="lzw") # res is dots per inch
par(mfrow=c(2,1)) # two panels, one above the other
# par(mai = c(0.65 - 0.1, 0.82, 0.1 + 0.5, 0.42+0.4)) #for postscript
maivalsA = c(0.5, 0.6, 0.20, 0.18) # margins (inches) for panel A
par(mai = maivalsA)
# Panel A: the three fitted models plotted as probability densities
plot(xvec, fpow, type="l", xlim=c(0, 120), ylim=c(-.03,0.06), col="blue", ylab="Probability density", xlab="", xaxs="i", yaxs="i", mgp=c(2.0,0.5,0))
axis(1, at=120, labels=120, mgp=c(2.0, 0.5, 0)) # otherwise 120
abline(h=0, col="grey", lwd=2) # at this fig size
lines(xvec, fexp, col="red")
lines(xvec, fquad, col="black", lwd=1.5)
figlabpos = 0.95 # proportion of x and y axis lengths to put (a)
# etc. in the corner of
text(figlabpos*120, -0.03 + figlabpos*0.09, "A", font=2)
# Panel B: blowing up quadratic rank/frequency to show the hump
xvals = seq(3,4.5,0.01)
x.lim = c(min(xvals)-0.5, max(xvals))
# For quadratic model
quadbigeye = 10^kappa * xvals^gamma * xvals^(theta * log10(xvals))
vertshift = maivalsA[3] # shift top and bottom margin by the same
maivalsB = maivalsA + c(vertshift, 0, -vertshift, 0)
par(mai = maivalsB)
plot(xvals, quadbigeye, ylab=expression( paste("Number of ", moves >= x)), xlab="Movement length, x (m)", xlim=x.lim, ylim=c(25500, 25850), xaxs="i", yaxs="i", mgp=c(2.0,0.5,0), col="black", lwd=1.5, type="l") # axes=FALSE,
points(xvals[1], quadbigeye[1]) # point at x=3.0
points(xvals[xvals==4], quadbigeye[xvals==4]) # point at x=4.0
text(min(x.lim) + figlabpos*diff(x.lim) ,figlabpos*350 + 25500, "B", font=2)
dev.off() # close the tiff device
# End of Figure 2.
# Figure \ref{fig:quadpdf}(B) magnifies the start of the quadratic function plotted on rank/frequency axes (blowing up the start of Supplementary Figure 1(h) of \cite{sims08}, but without logarithmic axes). The negative density function means that the model predicts 25,603 moves $\geq 3$~m, yet almost 200 more (25,801) moves $\geq 3.5$~m, and then 25,649 moves $\geq 4$~m. Obviously, a model should not predict more moves $\geq 4$~m than it does moves $\geq 3$~m. The fundamental reason that this problematic situation arose is that a quadratic function, which is hump-shaped, was fitted to a rank/frequency plot, which by definition cannot be hump-shaped (it must be non-increasing).
print("For quadratic model, number of moves >= 3 m:")
print(quadbigeye[1])
print("Number of moves >= 3.5 m:")
print(quadbigeye[xvals==3.5])
print("Number of moves >= 4 m:")
print(quadbigeye[xvals==4])
# We agree that the quadratic model ``has no particular statistical or biological justification'' (page 4 of Supplementary Information of \cite{sims08}). Without it, we find that the erroneous rank/frequency method of \cite{sims08} actually favours the exponential model (Table 1, {\bf f}) for the tuna data. This conclusion was obscured by the introduction of the third (quadratic) model.
print("Table 1e: BIC weights for all three models, as in original Supplementary Table 6:")
print(BICtab(lm.pow, lm.exp, lm.quad, nobs = length(x), weights=TRUE,
sort=FALSE))
print("Table 1f: BIC weights for just power-law and exponential models")
print(BICtab(lm.pow, lm.exp, nobs = length(x), weights=TRUE, sort=FALSE))
# Issues one and two also apply to the binning methods ({\bf b-d}) of \cite{sims08} -- likelihoods were incorrectly calculated and tested models are not normalised probability distributions. These issues are in addition to the inaccuracies known to occur when using such regression approaches to estimate power-law exponents \cite{newm05, srp07, edwa07, weg08, edwa08}; also, goodness-of-fit was not properly assessed \cite{edwa07, edwa08, asld11, pc11}. Thus, distributions were tested erroneously throughout \cite{sims08}, and the original result of close resemblance to ``an inverse-square power law ...~that is typical of ideal L\'evy walks'' \cite{sims08} was based on incorrect methods.
# End of issue 3
# Save workspace so the fitted objects can be inspected later
if(original.data) {save.image(file = "codeBbigeye2orig.RData")} else
{save.image(file = "codeBbigeye2pseudo.RData")}
# Extra code that was used with the original data to reproduce
# the original Supplementary Figure 1h. Commented out.
#if(original.data) {fig.name = "SuppFig1horig"} else
# {fig.name="SuppFig1hpseudo"}
#postscript(paste(fig.name, ".eps", sep=""), horizontal=FALSE, paper="special", height = 6, width = 6)
# plot(sort(x, decreasing=TRUE), 1:n, log="xy", ylab=expression( paste("Number of ", moves >= x)), xlab="Movement length, x (m)", xlim=c(2, 500), ylim=c(0.8, 40000), axes=FALSE, xaxs="i", yaxs="i", mgp=c(2.0,0.5,0))
# axis(1, at=c(1, 10, 100, 1000, 10000), labels = c(1, 10, 100, 1000, 10000), mgp=c(1.7,0.7,0))
# axis(1, at=2, labels=2, mgp=c(1.7,0.7,0))
# axis(1, at=c( ll*0.1, ll, ll*10, ll*100), labels=rep("",36), tcl=-0.2)
# axis(2, at=c(1, 10, 100, 1000, 10000), labels=c(1, 10, 100, 1000, 10000), mgp=c(1.7,0.7,0))
# axis(2, at=c(ll*0.1, ll, ll*10, ll*100, ll*1000, ll*10000), labels=rep("",54), tcl=-0.2)
# box()
# lines(xrankfreqinf, yrankfreqpowinf, col="blue")
## For exponential model
## This was the 4th attempt at figuring out the curve
## (we had to match up the plot then work backwards to
## work out how it was derived): (use xfrankfreqinf not x?)
#expmaybe4 = 10^beta * 10^(alpha * x) # equation (13)
# lines(x, expmaybe4, col="red", lty=5)
## For quadratic model
#quadbigeye = 10^kappa * x^gamma * x^(theta * log10(x))
#lines(x, quadbigeye, col="green", lty=5)
#dev.off()