\documentclass[article,nojss]{jss} %\VignetteIndexEntry{RobPer_vignette} \usepackage{amsmath,amssymb,subfig,caption,dsfont,tabularx,diagbox,capt-of,multirow} \usepackage{thumbpdf,lmodern} \newcommand{\SE}{\operatorname{SE}} \newcommand{\SY}{\operatorname{SY}} \newcommand{\transp}{\top} \newcommand{\Per}{\operatorname{Per}} \newcommand{\var}{\operatorname{var}} \newcommand{\mysize}{\footnotesize} \newcommand{\med}{\operatorname{med}} \definecolor{hellgrau}{gray} {0.9} \newcolumntype{C}[1]{>{\centering\arraybackslash}p{#1}} \newlength{\zelleI} \newlength{\zelleZ} \newlength{\zelleE} \newlength{\zelleA} \newlength{\zelleS} \newlength{\zelleG} \newlength{\zelleIbisE} \newlength{\zelleIbisS} \newlength{\zelleZbisS} \newlength{\zelleZbisG} \newlength{\zelleEbisA} \newlength{\zelleEbisS} \newlength{\zelleEbisG} \newlength{\zelleAbisS} \newlength{\zelleAbisG} \newlength{\zelleSbisG} \newlength{\zelleZbisE} \newlength{\laengeII} \newlength{\laengeIII} \DefineVerbatimEnvironment{Sinput}{Verbatim}{xleftmargin=0em} \author{Anita M. Thieler\\TU Dortmund University \And Roland Fried\\TU Dortmund University \And Jonathan Rathjens\\TU Dortmund University} \Plainauthor{Anita M. Thieler, Roland Fried, Jonathan Rathjens} \title{\pkg{RobPer}: An \proglang{R}~Package to Calculate Periodograms for Light Curves Based on Robust Regression} \Plaintitle{RobPer: An R Package to Calculate Periodograms for Light Curves Based on Robust Regression} \Shorttitle{\pkg{RobPer}: Calculating Periodograms Based on Robust Regression in \proglang{R}} \Abstract{ An important task in astroparticle physics is the detection of periodicities in irregularly sampled time series, called light curves. The classic Fourier periodogram cannot deal with irregular sampling and with the measurement accuracies that are typically given for each observation of a light curve. Hence, methods to fit periodic functions using weighted regression were developed in the past to calculate periodograms. We present the \proglang{R}~package \pkg{RobPer} which allows to combine different periodic functions and regression techniques to calculate periodograms. Possible regression techniques are least squares, least absolute deviations, least trimmed squares, M-, S- and $\tau$-regression. Measurement accuracies can be taken into account including weights. Our periodogram function covers most of the approaches that have been tried earlier and provides new model-regression-combinations that have not been used before. To detect valid periods, \pkg{RobPer} applies an outlier search on the periodogram instead of using fixed critical values that are theoretically only justified in case of least squares regression, independent periodogram bars and a null hypothesis allowing only normal white noise. Finally, the package also includes a generator to generate artificial light curves. } \Keywords{periodogram, light curves, period detection, irregular sampling, robust regression, outlier detection, Cram\'{e}r-von-Mises distance minimization, time series analysis, beta distribution, measurement accuracies, astroparticle physics, weighted regression, regression model} \Plainkeywords{periodogram, light curves, period detection, irregular sampling, robust regression, outlier detection, Cramer-von-Mises distance minimization, time series analysis, beta distribution, measurement accuracies, astroparticle physics, weighted regression, regression model} %\Volume{69} %\Issue{9} %\Month{March} %\Year{2016} %\Submitdate{2013-09-26} %\Acceptdate{2015-03-03} %\DOI{10.18637/jss.v069.i09} \Address{ Anita M. Thieler\\ Faculty of Statistics\\ TU Dortmund University\\ 44221 Dortmund, Germany\\ E-mail: \email{anita.thieler@tu-dortmund.de}\\ URL: \url{http://www.statistik.tu-dortmund.de/thieler-en.html} } \input{chart/boxenecht.tex} \begin{document} \SweaveOpts{echo=TRUE, results=hide} \vspace*{-0.5cm} \section{Introduction}\label{sec:intro} We introduce the \proglang{R} \citep{r} package \pkg{RobPer} \citep{RobPer}, which can be used to calculate periodograms and detect periodicities in irregularly sampled time series. Our special objective are light curves, which occur in astroparticle physics and are irregularly sampled times series $(t_i, y_i, s_i)_{i=1,\ldots,n}$ consisting of unequally spaced observation times $t_1, \ldots, t_n$, observed values $y_1, \ldots, y_n$ and measurement accuracies $s_1, \ldots, s_n$. The measurement accuracies $s_i$ give information about how precise the $y_i$ were measured. They can be interpreted as estimates for the standard deviations of the observed values. The observed values possibly contain a periodic fluctuation $y_f$ with fluctuation period $p_f$ and the irregularly spaced observation times $t_i$ are realizations of random variables with a periodically shaped density. <>= options(prompt = "R> ", continue = "+ ", width = 70, useFancyQuotes = FALSE) library("RobPer") @ \begin{figure}[t!] \subfloat[]{ {\setkeys{Gin}{width=.5625\textwidth} <>= data(Mrk421) temp <- Mrk421 par(mgp=c(2, 1, 0), mar=c(3,3,3,1)) jahre <- seq(1992,2010, by=3) xlim <- c(48622, 55000) yats <- seq(0, 15, by=5) plot(temp[,1], temp[,2], pch=16, col=rgb(0,0,0,alpha=0.5), axes=F, xlab="t (Gregorian date)", ylab="y", xlim=xlim, ylim=max(temp[,2]+temp[,3])*c(0, 1.1), cex=0.7) rect(temp[,1], temp[,2]+temp[,3], temp[,1], temp[,2]-temp[,3], border=rgb(0,0,0,alpha=0.5)) mtext("t (Markarian Julian days)", 3, line=2) axis(2, at=yats) axis(1, at= as.Date(paste(jahre, "-01-01", sep=""))- as.Date("1858-11-17"),labels=jahre) axis(3) box() @ } \label{fig:mrk421_a}} \subfloat[]{ {\setkeys{Gin}{width=.3375\textwidth} <>= par(mar=c(3,3,0.3,0.1), mgp=c(2,1,0)) p_sam <- 27.31 test <- hist(((Mrk421[,1])%%p_sam), breaks=seq(from=0, to=p_sam, length.out=20),main="", col="grey80",freq=F, xlim=c(0,p_sam), xlab="Cycle", axes=F) axis(1, at=(0:9)*3) axis(2, at=(0:2)*4/100) y <- test$density x <- test$mid fak1 <- sin(x/p_sam*2*pi) fak2 <- cos(x/p_sam*2*pi) mo <- lm(y~fak1+fak2)$coeff ff <- function(x){mo[1]+ mo[2]*sin(x/p_sam*2*pi) + mo[3]*cos(x/p_sam*2*pi)} curve(ff, add=T, lwd=2) box() @ } \label{fig:mrk421_b}} \caption{Light curve with gamma particle emissions for the very high energy gamma particle source Mrk~421 \citep[see][and references therein]{tluczykont2010long}. Panel~\ref{fig:mrk421_a} shows the light curve, vertical lines at each point show the reported measurement accuracies. Panel~\ref{fig:mrk421_b} depicts a histogram of the observation times $t_i$ modulo the period $p_s=27.31$. A sine represents the shape rather well.} \label{fig:mrk421} \end{figure} Such periodicity in the pattern of the observation times is a typical phenomenon, as the sampling of astroparticle physics' time series is influenced among others by astronomical constellations. For example, plotting a histogram of the observation times for the gamma particle source Mrk~421 modulo the period $p_s=27.31$ shows an unequal distribution over a cycle of this length (see Figure~\ref{fig:mrk421}). This is due to the fact that observations cannot be sampled during full moon and the moon period is similar to $p_s$. So we assume the following model for the observations indexed by $i=1,\ldots,n$: \begin{alignat}{2} &T_i = T^\star_{(i)}, &&T_1^\star,\ldots, T_n^\star \sim \mathcal D(p_s)\; \text{i.i.d.}, \label{eq:messzeiten}\\ &Y_i =Y_{f;i} + Y_{w;i}, \label{eq:flukpluswr}\\ &Y_{f;i} = f\left(\frac{T_i}{p_f}\right),&& f(\xi) = f(\xi+1) \;\forall \xi \in \mathds R \label{eq:perfluk}\\ &Y_{w;i} \sim \mathcal N(0,\sigma_i^2), \label{eq:heterorausch}\\ &s_i \text{ : given estimate for }&&\sigma_i \text{ independent from } Y_1,\ldots,Y_n, \notag \end{alignat} where $T^\star_{(i)}$ denotes the $i$th ordered observation time in $T_1^\star,\ldots, T_n^\star$ and $\mathcal D(p_s)$ is a periodic sampling density with period $p_s$. The observation times $t_1,\ldots,t_n$ and the observed values $y_1,\ldots,y_n$ are realizations of $T_1,\ldots,T_n$ and $Y_1,\ldots,Y_n$, respectively. We assume the observation times to be measured without error. $Y_{f;i}$ is the systematic periodic component in the observations, corresponding to an unknown periodic function $f$ and the period $p_f$ we are searching for. $Y_{w;i}$ is additive noise. To detect a periodic fluctuation with period $p_f$ in the observed values $y_i$, it is not possible to use the standard periodogram of Fourier analysis. This method can only be applied to time series with equidistant observation times, while light curves are typically irregularly sampled. A setting-adapted procedure, the Deeming periodogram \citep{deeming75}, is not recommendable either in this case, because it is known to react to a periodicity $p_s$ in the sampling (see \citealt{hall06}). In order to determine periodicity in light curves, other methods than the classical Fourier periodogram or the Deeming periodogram should be used. Popular periodogram methods in astroparticle physics are for example the Lomb-Scargle periodogram \citep{scargle82} or the phase dispersion minimization periodogram \citep{stellingwerf78}. These and many other approaches can be generalized to fitting periodic functions to the light curve using least squares regression and calculating periodogram bars based on $\SE$ and $\SY$, where $\SE$ is the remaining variance in the residuals of the fit and $\SY$ is the overall variance in the observed values $y_i$. An even broader class of periodogram methods additionally allows application of robust regression instead of least squares regression and weighted regression to take the measurement accuracies $s_i$ into account. The function \code{RobPer} in our homonymous \proglang{R}~package calculates a periodogram of a light curve based on fitting periodic functions to $\left(t_i,y_i\right)_{i=1,\ldots,n}$ using least squares or a robust regression technique, optionally taking measurement accuracies $s_i$ into account using weighted regression. The coefficient of determination corresponding to the objective function of the regression technique is used as periodogram bar. This proceeding incorporates analogues to most of the existing periodograms and introduces several new techniques. Preliminary implementations of most of these periodogram methods have been compared by \citet{thieler2013periodicity}. Here, we explain the usage of the \proglang{R}~package \pkg{RobPer}, which makes improved and extended methods for period detection publicly available. This article is organized as follows: In Section~\ref{sec:robper}, the usage and the structure of the function \code{RobPer} are explained. Especially, the different periodic functions and regression techniques are discussed and related to the existing periodogram methods. Diagrams which show how this \proglang{R}~function is implemented in detail are displayed in Appendix~\ref{sec:app:struktogramme}. Section~\ref{sec:betacvmfit} is devoted to the question how to find valid periods using a periodogram. \citet{thieler2013periodicity} propose robust fitting of a beta distribution combined with outlier detection. The function \code{betaCvMfit} in the package \pkg{RobPer} performs this. In Section~\ref{sec:tsgen}, the function \code{tsgen} is presented which allows to generate artificial light curves. Some examples for how to use the package are given in Section~\ref{sec:application}. Section~\ref{sec:summary} concludes with a summary. The \pkg{RobPer} software package is available from the Comprehensive \proglang{R} Archive Network (CRAN) at \url{https://CRAN.R-project.org/package=RobPer}. Other \proglang{R} packages implementing periodograms of irregularly sampled time series are the packages \pkg{lomb} (\citealt{ruf1999lomb}, function \code{lsp}), \pkg{cts} (\citealt{wang2013cts}, function \code{spec.ls}) and \pkg{nlts} (\citealt{bjornstad2013nlts}, function \code{spec.lomb}). They calculate the Lomb-Scargle periodogram, which is based on the least squares fit of a sine function. Furthermore, the package \pkg{GeneCycle} (\citealt{ahdesmaeki2012GeneCycle}, function \code{robust.spectrum}) fits sine functions using robust M-regression to calculate a periodogram based on the square of the estimated amplitude. None of these functions permits taking measurement accuracies using weighted regression into account and most of them (apart from the function \code{spec.lomb}) have restrictions concerning the trial periods fitted. \section[Calculate periodograms with RobPer]{Calculate periodograms with \code{RobPer}}\label{sec:robper} %% Note: If there is markup in \(sub)section, then it has to be escape as above. The \proglang{R}~function \code{RobPer} calculates a periodogram of a given light curve $\left(t_i, y_i, s_i\right)_{i=1,\ldots,n}$. This is done by fitting a periodic function $g$ to the data $\left(t_i, y_i\right)_{i=1,\ldots,n}$. The function $g$ has $m$ parameters entering $g$ linearly. It has a period of 1 and is transformed by $g\left(\frac{t}{p_j}\right)$ for each given trial period $\left(p_j\right)_{j=1,\ldots,q}$. A simple example is $g(t)=\sin(2\pi t)\beta_1+\cos(2\pi t)\beta_2$. The periodogram bars for the different trial periods are defined as the coefficients of determination of the respective fits. Using weighted regression with weights $1/s_i$ makes it possible to take the measurement accuracies into account. As the shape of the true fluctuation $f$ in Equation~\ref{eq:perfluk} is usually unknown, we will typically have $g\neq f$. \begin{table}[t!] \newlength{\abstand} \setlength{\abstand}{1em} %\bgroup \footnotesize \tabcolsep 1pt \begin{tabularx}{\linewidth}{lX} \hline Argument & Comment\\ \hline \code{ts} $\in\mathds R^{n\times 3}$ or $\mathds R^{n\times 2}$ & Light curve $(t_i, y_i, s_i)$ or $(t_i, y_i)$, $i=1, \ldots,n$ ;\\ &If \code{weighting = FALSE} the measurement accuracies $s_i$ column may be omitted.\\[\abstand] \code{weighting} $\in$ \{\code{T}, \code{F}\} & If \code{TRUE}, weighted regression is performed to take into account the $s_i$.\\[\abstand] \code{periods} $\in \mathds R_{>0}^q$& Trial periods $p_1,\ldots,p_q$.\\[\abstand] \code{regression} & Regression technique (see Section~\ref{sec:regression}), possible choices:\\& \code{"L2"}, \code{"L1"}, \code{"LTS"}, \code{"S"}, \code{"huber"}, \code{"bisquare"}, \code{"tau"}.\\[\abstand] \code{model} & Periodic fluctuation to be fitted (see Section~\ref{sec:model}), possible choices:\\& \code{"step"}, \code{"2step"}, \code{"sine"}, \code{"fourier(2)"}, \code{"fourier(3)"}, \code{"splines"}.\\[\abstand] \code{steps} $\in \mathds N$ & Number of steps per cycle for periodic step functions.\\ & Default: 10\\[\abstand] \code{var1} $\in$ \{\code{T}, \code{F}\} & \code{TRUE} sets variance estimate to one for weighted M-regression.\\ & Default: \code{weighting}\\[\abstand] \code{tol} $\in \mathds R_{>0}$& Precision for convergence criteria.\\& Used in case of M-regression and in case of LTS regression if \code{LTSopt = TRUE}.\\ & Default: $10^{-3}$ \\[\abstand] \code{genoudcontrol} $\in \mathds N^3$ & Settings for \code{genoud} (see paragraph about LTS regression in Section~\ref{sec:regression}): \\ & \code{max.generations}, \code{wait.generations}, \code{pop.size} \\ & Used if \code{regression = "bisquare"} or \code{LTSopt = TRUE \& regression = "LTS"}.\\ & Default: \{50, 5, 50\}\\[\abstand] \code{LTSopt} $\in$ \{\code{T}, \code{F}\} & Determines whether the regression result of \code{ltsReg} should be optimized.\\ & Default: \code{TRUE} if \code{regression = "LTS"}\\[\abstand] \code{taucontrol} $\in \mathds N^4 \times$ \{\code{T}, \code{F}\} & Settings for $\tau$-regression:\\ & \code{N}, \code{kk}, \code{tt}, \code{rr}, \code{approximate}. \\ & Used if \code{regression = "tau"}, \code{rr} only necessary for \code{approximate = TRUE}.\\ & Default: \{100, 2, 5, 2, \code{FALSE}\}\\[\abstand] \code{Scontrol} $\in \mathds N^3\times\mathds R_{>0}^2\times\mathds N $ \phantom{m} & Settings for S-regression:\\ & \code{N}, \code{kk}, \code{tt}, \code{b}, \code{cc}, \code{seed}.\\ & Used in case of \code{regression = "S"}.\\& \code{seed} can be fixed in order to get reproducible results or can be left empty.\\ & Default: \{\code{N}, 2, 5, 0.5, 1.547, \code{NULL}\} \\& with \code{N} = 50 if \code{weighting = FALSE} and \code{N = 200} if \code{weighting = TRUE}.\\[2\abstand] \hline Return value & \\ \hline \code{periodogram} $\in\mathds R^q$ & Vector of periodogram bars belonging to the trial periods.\\[\abstand] Possibly warnings & \\ \hline \end{tabularx} \caption{Arguments and return values of the function \code{RobPer}. \{\code{T}, \code{F}\} means \{\code{TRUE}, \code{FALSE}\}.}\label{tab:robperinput} %\egroup \end{table} Table~\ref{tab:robperinput} gives an overview over all arguments of \code{RobPer}. The possible shapes of the function $g$ that may be fitted by \code{RobPer} are presented in Section~\ref{sec:model}. Fitting them using least squares regression is in many cases equivalent to already existing periodogram methods (see Table~\ref{tab:modellregschema} or \citealt{thieler2013periodicity} for a more detailed discussion). In addition to least squares regression, \code{RobPer} offers a selection of robust regression techniques to fit $g\left(\frac t p_j\right)$, see Section~\ref{sec:regression}. All regression techniques implemented in \code{RobPer} are based on minimizing an objective value \begin{align} \SE = \zeta\left(y-X\beta\right) \label{eq:SE} \end{align} with respect to the unknown parameter value $\beta \in \mathds R^m$, where $X \in \mathds R^{n\times m}$ is the design matrix containing the known components of $g\left(\frac t p\right)$ at the measurement times $t_1,\ldots,t_n$ with $p$ being a trial period and $y$ the vector of observations $y_1,\ldots,y_n$. In the simple example mentioned above, the $i$th row of $X$ has the elements $\sin(2\pi t_i/p)$ and $\cos(2\pi t_i/p)$. The function $\zeta:\mathds R^n \rightarrow [0,\infty[$ is chosen according to the regression method, e.g., $\zeta(r)=\sum_{i=1}^n r_i^2$ for least squares regression. Using the same regression technique, the location $\mu$ of the observations $y_1,\ldots,y_n$ can be estimated minimizing \begin{align} \SY = \zeta\left(y-\mathfrak{i}\mu \right) \label{eq:SY} \end{align} with $\mathfrak{i} =\mathds 1_n$ being an $n$-variate vector of ones in case of unweighted regression. The periodogram bar can then be calculated as $R^2=1-\frac\SE\SY$. This definition for the coefficient of determination does not only apply for least squares regression, but also for least absolute deviation- ($L_1$) and M-regression in general (see~\citealt[p.~171]{maronna2006robust}) as well as for S-, least trimmed squares- (LTS) and $\tau$-regression (see \citealt{croux2003estimators}). If it is intended to take given measurement accuracies $s_1, \ldots,s_n$ into account, weighted regression can be performed. In this case, the terms $y$, $X$ and $\mathfrak i$ in the two fitted models \begin{align} y & = X\beta + \epsilon \qquad \text{(full model)},\\ y &= \mathfrak i \mu + \epsilon \qquad \text{(location model)},\\ \mbox{with }&\epsilon \in \mathds R^n, \epsilon_i\underset{\text{i.i.d}}{\sim} \mathcal N(0,\sigma^2), \end{align} are replaced by $\widetilde y_i = y_i/s_i$, $\widetilde X_{ij}= X_{ij}/s_i$ and $\widetilde {\mathfrak i_i} = \mathfrak i_i /s_i = \mathds 1_n/s_i$, respectively. %\begin{align} %\widetilde y & = \widetilde X\beta + \widetilde \epsilon,\\ %\widetilde y &= \widetilde { \mathfrak i} \mu + \epsilon,\\ %\text{with }& \widetilde y_i = y_i/s_i,\\ % & \widetilde X_{ij}= X_{ij}/s_i,\\ % & \widetilde {\mathfrak i_i} = \mathfrak i_i /s_i = \mathds 1_n/s_i. %\end{align} In the following, we will focus on the case of unweighted regression and only point out the handling of weighted regression, when both procedures differ. Table~\ref{tab:modellregschema} displays periodogram methods following the principle of fitting periodic functions. Up to now, weighted regression or robust regression in affiliation with periodic step functions has only been performed by \citet{thieler2013periodicity}, though the unweighted least squares versions belong to the most popular periodogram methods in this area of research. S- or $\tau$-regression, which are also available in \code{RobPer}, have not been investigated up to now in this context. \begin{table}[t!] \setlength{\abstand}{1.4em} \footnotesize \begin{tabularx}{\linewidth}{llX} \hline Model & Regression technique& Publication\\ &&\qquad(Name of the method)\\ \hline \code{step}& \code{L2} & \citet{leahy83} \\ &&\qquad(epoch folding)\\ & \code{L2} & \citet{schwarzenbergczerny89} \\ &&\qquad(analysis of variance)\\ &\code{L2}, \code{L1}, \code{huber}, \code{bisquare} & \underline{\citet{thieler2013periodicity}}\\[\abstand] \code{2step} & \code{L2} & \citet{stellingwerf78} \\ &&\qquad(phase dispersion minimization) \\ &\code{L2}, \code{L1}, \code{huber}, \code{bisquare} & \underline{\citet{thieler2013periodicity}}\\[\abstand] \code{sine} & \code{L2} & \citet{scargle82} \\ &&\qquad(Lomb-Scargle)\\ & \code{L2} & \citet{zechmeister09}\\ &&\qquad (\underline{generalized Lomb-Scargle})\\ &\code{L2} & \citet{cumming99} \\ &&\qquad(\underline{floating mean})\\ &\code{L2} & \citet{ferraz81} \\ &&\qquad (\underline{date compensated Fourier transform*})\\ &\code{L2} & \citet{reegen07} \\ &&\qquad(\underline{SigSpec*})\\ & \code{L1} & \citet{li09patent}*, \citet{li2010nonlinear}*\\ & \code{LTS} &\citet{ahdesmaeki07}*\\ & \code{bisquare} & \citet{ahdesmaeki07}*\\ & \code{huber} & \citet{zhang05robust}*\\ &\code{L2}, \code{L1}, \code{huber}, \code{bisquare} & \underline{\citet{thieler2013periodicity}}\\[\abstand] \code{fourier(2)},& \code{L2} & \citet{hall00}\\ \qquad\code{fourier(3)}& \code{L2} & \citet{palmer09} \\ &&\qquad(\underline{Fast-$\chi^2$})\\ &\code{L2}, \code{L1}, \code{huber}, \code{bisquare} & \underline{\citet{thieler2013periodicity}}\\[\abstand] \code{splines} &\code{L2} & \citet{akerlof1994application}\\ &\code{L2} & \citet{hall00}\\ & \code{L2} & \citet{oh04}\\ &&\qquad (generalized cross validation)\\ & \code{huber} &\citet{oh04} \\ &&\qquad(robust cross validation) \\ &\code{L2}, \code{L1}, \code{huber}, \code{bisquare} & \underline{\citet{thieler2013periodicity}}\\[\abstand] \hline \end{tabularx} \caption{Published periodogram methods that rely on fitting a periodic model $g$ to a light curve using a regression technique. Models (see Section~\ref{sec:model}): periodic step functions and pairwise overlapping step functions (\code{step} and \code{2step}), the sine function (\code{sine}), Fourier series of second and third degree and periodic spline functions (\code{fourier(2)}, \code{fourier(3)} and \code{splines}). Regression techniques: See Table~\ref{tab:regressiontechniques} for labels. The underlined methods can take into account measurement accuracies using weighted regression. The periodogram bars of methods marked by * do not base on SE or SY, but on the parameter vector of the function fitted (e.g., squared amplitude).}\label{tab:modellregschema} \end{table} \subsection[Periodic function fitted: Argument model]{Periodic function fitted: Argument \code{model}}\label{sec:model} For each trial period $p_i$, $i\in \{1,\ldots,q\}$ (given by the argument \code{periods}, see Table~\ref{tab:robperinput}), a periodic function (defined by \code{model}) is fitted to the light curve (using regression technique \code{regression}). Implemented periodic functions include step functions, sine functions, Fourier series and spline functions. \subsubsection{Step functions}\label{sec:step} Many periodogram methods from astroparticle physics such as the epoch folding periodogram \citep{leahy83} or the analysis of variance periodogram \citep{schwarzenbergczerny89} can be interpreted as fitting a step function to a light curve (see \citealt{schwarzenbergczerny98dist} or \citealt{thieler2013periodicity}). They use periodogram bars related to $R^2$, $n$ and the numbers of steps per cycle. Another typical periodogram method in astroparticle physics is the phase dispersion minimization periodogram (PDM,~\citealt{stellingwerf78}). Depending on the particular setting the periodogram bar in many cases equals the mean of the coefficients of determination of two fits with different step functions with staggered jumps (see \citealt{thieler2013periodicity} or \citealt{thieler2013phd} for more details). \code{RobPer} provides two options to fit periodic step functions. The number of steps per cycle is controlled by the argument \code{steps}. Using \code{model = "step"}, a single periodic step function with steps of equal width is fitted for each trial period. Performing \code{regression = "L2"}, \code{model = "step"} is equivalent to calculating an epoch folding- or analysis of variance periodogram. Using \code{model = "2step"}, two different step functions with opposed jump times and steps of equal width are fitted separately and the periodogram bar is the mean of both coefficients of determination. This is the only option where two periodic functions are fitted for one trial period. It is included to provide the PDM periodogram with overlapping bins. \subsubsection{Sine functions}\label{sec:sine} Sine functions are periodic and quite popular for investigating periodicity. The classic periodogram of Fourier analysis for equally sampled time series represents the explained variance SE of a least squares fit of a sine model to the zero-centered time series. The Lomb-Scargle periodogram \citep{scargle82} works equivalently for unequally sampled time series. As the mean of an irregularly sampled time series is not identical to the least squares fit of an intercept in a sine model, more recent methods use the uncentered data and fit a model with intercept, e.g., the floating mean periodogram by~\citet{cumming99} and the generalized Lomb-Scargle periodogram by~\citet{zechmeister09}. Performing \code{regression = "L2"}, \code{model = "sine"} is equivalent to calculating those periodograms and in case of equidistant observation times also equivalent to the Fourier periodogram. Some other methods as the Date Compensated Fourier Transform by~\citet{ferraz81}, the SigSpec periodogram by~\citet{reegen07} or robust approaches by \citet{ahdesmaeki07} and \citet{zhang05robust} apply the same regression step as the floating mean- and the generalized Lomb-Scargle periodogram, but use the squared amplitude of the fitted sinusoid as the periodogram bar. In case of regular sampling, this is another representation of the classical periodogram of Fourier analysis. As the amplitude is a concept closely related to trigonometric functions, \code{RobPer} uses the coefficient of determination only, to obtain a general method independent of the periodic function chosen. \subsubsection{Further periodic functions}\label{sec:furthermodels} Recently, fitting more complex periodic functions has been proposed for periodograms. Fourier series (see~\citealt{hall00} and ~\citealt{palmer09}) and periodic splines (see~\citealt{akerlof1994application}, \citealt{hall00} and \citealt{oh04}) may provide better adaptivity compared to sine functions, but still present a continuous function, unlike the step function. \code{RobPer} offers the possibility to fit Fourier series of second (\code{model = "fourier(2)"}) or third (\code{model = "fourier(3)"}) degree or a periodic spline function with four knots per cycle (\code{model = "splines"}). For the latter option, B-splines are generated using the function \code{spline.des} from the package \pkg{splines} \citep{splines}. \subsection[Regression techniques: Argument regression]{Regression techniques: Argument \code{regression}}\label{sec:regression} Instead of fitting the models mentioned above by the popular least squares regression (see Table~\ref{tab:modellregschema}), \code{RobPer} also allows application of six robust regression techniques, see Table~\ref{tab:regressiontechniques}. Robust regression techniques like least absolute deviations, least trimmed squares \citep{rousseeuw1984robust} and M-regression \citep{huber1981robust} have already been used to fit sines (evaluating the squared amplitude) by \citet{zhang05robust}, \citet{ahdesmaeki07}, \citet{li09patent} and \citet{li2010nonlinear}. M-regression with the Huber function was applied to fit periodic splines by \citet{oh04}. \citet{thieler2013periodicity} use least absolute deviations and M-regression and all models described in this article to calculate periodograms based on the coefficient of determination. To the best of our knowledge, S- \citep{rousseeuw1984robust} and $\tau$-regression \citep{yohai1988high} have not been used before in periodogram calculation. For the latter, \code{RobPer} uses the algorithms Fast-S from \citet{salibian2006fast} and Fast-$\tau$ from \citet{salibian2008fast} and slightly modified versions of the code distributed with the respective publication (see the respective paragraphs entitled in Section~\ref{sec:regression}). The following paragraphs outline the algorithms used by \code{RobPer} for calculating the different regression estimators. For the basic definitions of these regression techniques we refer to the literature mentioned above and the book by \citet{maronna2006robust}. \begin{table}[t!] \small \begin{tabularx}{\linewidth}{llX} \hline Regression technique & \code{regression} & \proglang{R}~function (\textbf{package})\\ \hline Least squares & \code{"L2"} & \code{lm} (\pkg{stats}, \citealt{stats})\\ Least absolute deviations & \code{"L1"} & \code{rq} (\pkg{quantreg}, \citealt{koenker2012quantrag})\\ Least trimmed squares & \code{"LTS"} & \code{ltsReg} (\pkg{robustbase}, \citealt{rousseeuw2012robustbase})\\ M-regression\\ \dots with Huber function & \code{"huber"} & Own implementation. \\ \dots with Bisquare function & \code{"bisquare"} & \code{lmrob..M..fit} (\pkg{robustbase}, \citealt{rousseeuw2012robustbase}) \\ S-regression & \code{"S"} & Slightly modified code from \citet{salibian2006fast}.\\ $\tau$-regression & \code{"tau"} & Slightly modified code from \citet{salibian2008fast}.\\ \hline \end{tabularx} \caption{Regression techniques implemented in \code{RobPer} and \proglang{R}~functions used to perform the regression technique. For more details see Section~\ref{sec:regression}.}\label{tab:regressiontechniques} \end{table} \subsubsection{LTS regression}\label{sec:ltsregression} The \proglang{R}~function \code{ltsReg} from package \pkg{robustbase} \citep{rousseeuw2012robustbase} is used to perform LTS regression in \code{RobPer}. In preliminary studies we observed that the function can have problems finding a good solution for some of the candidate periods. This results in coefficients of determination which are too small or sometimes even negative. By setting \code{LTSopt = TRUE}, it is possible to let \code{RobPer} further optimize the solution of \code{ltsReg} by using the \proglang{R}~function \code{genoud} from package \pkg{rgenoud} \citep{mebane2011rgenoud}. This function uses an evolutionary approach to improve the given solution, locally optimizing the temporarily best solutions in a gradient descent algorithm. Further arguments \code{pop.size} (size of one generation), \code{max.generations} (maximum of generations before stopping the algorithm) and \code{wait.generations} (maximum number of generations to wait for an improvement of the optimization criterion) control the behavior of the algorithm and can be set in \code{RobPer} by the argument \code{genoudcontrol} (see Table~\ref{tab:robperinput}). The argument \code{tol} controls the precision for convergence criteria. A further problem we observed is that \code{ltsReg} sometimes aborts the fit. However, it is typically able to perform the fit if it is run again. In case of a crash, \code{RobPer} calls \code{ltsReg} up to three times. After the third failed attempt, the respective periodogram bar is set to \code{NA}, or a least absolute deviation regression is performed. The latter is done, if the \code{ltsReg} regression result should be further processed, using the \code{genoud} algorithm or using the LTS result as initial estimate for an M-regression fit (see next paragraph). \subsubsection{M-regression}\label{sec:Mregression} In case of M-regression, a periodogram bar, i.e., the coefficient of determination $R^2=1-\frac{\SE}{\SY}$ is calculated from the values \begin{align} \SE &= \underset{\beta}{\min} \sum_{i=1}^n \rho\left(\frac{y_i-x_i^\transp \beta}{\widehat\sigma}\right)\label{eq:SE_M} \end{align} and \begin{align} \SY &= \underset{\mu}{\min} \sum_{i=1}^n \rho\left(\frac{y_i-\mathfrak{i}_i\mu}{\widehat\sigma}\right),\label{eq:SY_M} \end{align} where $\widehat\sigma$ is an estimate of the error scale $\sigma$ in the regression model. As explained above, Equations~\ref{eq:SE_M} and~\ref{eq:SY_M} represent the minimization criteria of the fits of the chosen periodic fluctuation ($\SE$ in Equation~\ref{eq:SE}) and of a location estimate (SY in Equation~\ref{eq:SY}), respectively. The function $\rho$ is a distance measure. The vector $\mathfrak{i}$ consists of ones in case of unweighted regression. As mentioned before, in case of weighted regression, $y_i$, $\mathfrak{i}_i$ and the rows $x_i$ of the design matrix are standardized by the measurement accuracy $s_i$ (see Figure~\ref{app:fig:robper} in Appendix~\ref{sec:app:struktogramme}). The value $\widehat \sigma$ is obtained in an initial estimation of the periodic fluctuation, calculating a scale estimate of the fitted residuals. In principle, one could use a different estimate of $\sigma$ calculated from fitting only an intercept in Equation~\ref{eq:SY_M}, but \citet[p.~171]{maronna2006robust} recommend using the scale estimate from the (larger) regression model. In our context this means that $\SY$ depends on the trial period and cannot be calculated globally. On the other hand this ensures that the regression model $Y=X\beta+\epsilon$, $\epsilon\sim\mathcal N(0,\sigma^2)$ is a generalization of the intercept model $Y=\mathfrak{i}\mu+\epsilon$, $\epsilon\sim\mathcal N(0,\sigma^2)$ and thus $\SE \leq \SY$ and $R^2\geq 0$. So for this regression technique, an implementation is needed where the scale estimate can be fixed in advance. For M-regression using the biweight function, the function \code{lmrob..M..fit} from package \pkg{robustbase} by \citet{rousseeuw2012robustbase} is used. This \proglang{R}~function includes Huber M-regression only as a limiting case of Hampel M-regression with all but one of its tuning constants set to very large values. In other \proglang{R} functions known to us for M-regression (\code{rlm} from package \pkg{MASS} by\citealt{venables2002MASS}, \code{iwlsm} from package \pkg{RSiena} by \citealt{ripley2012RSiena} and \code{robustregBS} and \code{robustRegH} from package \pkg{robustreg} by \citealt{johnson2011robustreg}), the scale estimate cannot be fixed in advance. Hence M-regression using the Huber function is newly implemented for \pkg{RobPer}. Like the functions specified before, this implementation is based on an iteratively reweighted least squares (IRWLS) approach (see \citealt[pp.~104--105]{maronna2006robust}), and meets our special requirements. For M-regression using the biweight function, the implementation makes also use of the function \code{genoud} from package \pkg{rgenoud} (see previous paragraph) to overcome possible problems with local optima. As noted above, weighted regression scales observed values and design matrices by the measurement accuracies. The variance of the error is expected to be about one then. Hence it can be reasonable to set $\widehat \sigma$ to one. This can be done in \code{RobPer} setting the argument \code{var1} to \code{TRUE}, as is recommendable in our experience in case of weighted M-regression. To calculate a periodogram bar using M-regression with IRWLS, three initial estimates are needed: A scale estimate $\widehat \sigma$ (if not set to one) and initial location estimates $\widehat \beta^{(0)}$ and $\widehat \mu^{(0)}$ for $\beta$ and $\mu$. The initial estimates should be obtained using robust techniques. As proposed by \citet[p.~105]{maronna2006robust} we use the median (weighted if the $s_i$ shall be taken into account) to initially estimate $\mu$. For $\beta$, LTS regression (see previous paragraph) is used. It has a high breakdown point and is appropriate in situations with many observations not agreeing with the best fit. This situation will often occur in periodogram calculation, as many trial periods and thus many wrong models are fitted to the light curve. The scale estimate $\widehat \sigma$ is calculated as the (weighted) median of the residuals of the LTS fit. \subsubsection{S-regression}\label{sec:Sregression} In case of \code{regression = "S"}, \code{RobPer} uses the Fast-S algorithm by \citet{salibian2006fast} to perform S-regression for fitting the periodic function efficiently. The algorithm starts with a set of \code{N} parameter candidates, locally optimizes them using \code{kk} iterations, then optimizes the \code{tt} best of these candidates until convergence and finally chooses the best parameter candidate. The \proglang{R}~function \code{FastS} used in \pkg{RobPer} is a slightly modified version of the function \code{fast.s} published by \citet{salibian2006fast}. It was changed in order to work more efficiently in the context given here, especially when fitting step functions, and to specify one parameter candidate in advance. This candidate is set to \begin{align} \widehat \beta_\mu = \begin{cases} (\hat\mu, \ldots, \hat\mu)^\transp \in \mathds R^m &\text{\code{model}}\in\{\text{\code{"step"}},\text{\code{"2step"}}, \text{\code{"splines"}}\}\\ (\hat \mu, 0,\ldots,0)^\transp \in \mathds R^m & \text{\code{model}}\in\{\text{\code{"sine"}}, \text{\code{"fourier(2)"}}, \text{\code{"fourier(3)"}}\}\end{cases}\label{eq:betagamma} \end{align} where $m$ denotes the dimension of the linear model of the periodic function and $\hat\mu$ denotes the location estimate. $\widehat \beta_\mu$ arises from plugging in the fit obtained from the location model into the parametrization of the full model. This ensures that fitting the full periodic function will not give a worse fit than fitting only a location parameter. Otherwise it could happen that SY $<$ SE and the coefficient of determination (which has to be in $[0,1]$) would be negative. Further changes in \code{FastS} are: \begin{enumerate} \item The arguments \code{k} and \code{best.r} are renamed to \code{kk} and \code{tt} to unify notation as in \code{FastTau}. The arguments \code{int}, \code{N}, \code{kk}, \code{tt}, \code{b}, \code{cc} and \code{seed} are merged to a list \code{Scontrol}, which is also an argument of \code{RobPer} (except for \code{int}, which is fixed in \code{RobPer}). \item If an intercept column is added to the design matrix (using \code{Scontrol$int = TRUE}), this is done before the dimension of the design matrix is determined (instead of doing this first and redoing it in case of \code{Scontrol$int = TRUE}). \item \label{item:gp} To find a subsample in general position, regressors $x_{i^\star}^\transp$ are sampled from the set of rows of the design matrix $X$ ignoring the frequency of occurrence in $X$. For each regressor $x_{i^\star}^\transp$, one value $y_i$ is then sampled from the entries of $y$ belonging to this regressor. In case of a step function to be fitted, one observation per step is drawn to get a subsample. \item If no subsample can be found in 100 trials, \code{FastS} returns \code{NA}. \code{RobPer} then releases a warning, but can calculate further periodogram bars for other trial periods. \item The internal functions \code{loss.S}, \code{re.s}, \code{f.w}, \code{scale1}, \code{our.solve} and \code{rho} are now defined outside \code{FastS}. Otherwise \proglang{R} would have to redefine them for each periodogram bar. \item The subfunction \code{norm} is replaced by the function \code{norm(..., "2")} from the package \pkg{base} \citep{base}. \item The labels of the return values are changed for better interpretation. \end{enumerate} \subsubsection[Tau-regression]{$\tau$-regression}\label{sec:tauregression} In case of \code{regression = "tau"}, $\tau$-regression is used to fit the periodic function. \code{RobPer} uses the Fast-$\tau$ algorithm of \citet{salibian2008fast} which works according to the same optimizing principle as \code{FastS} for S-regression (see previous paragraph), i.e., optimizing \code{N} candidates in \code{kk} iterations and further optimizing the \code{tt} best of these until convergence. Since computation of the objective value is expensive, it is possible to approximate it with \code{rr} iteration steps when choosing \code{approximate = TRUE}. For more details see \citet{salibian2008fast}. The \proglang{R}~function \code{FastTau} used in \pkg{RobPer} is a slightly modified version of the \proglang{R} code published in \citet{salibian2008fast} with similar changes as in \code{FastS} compared to \code{fast.s} (see previous paragraph). The changes are: \begin{enumerate} \item A candidate for $\beta_\mu$, see Equation~\ref{eq:betagamma}, is allowed. \item Arguments \code{N}, \code{kk}, \code{tt}, \code{rr} and \code{approximate} are combined to a list \code{taucontrol}, which is also an argument for \code{RobPer}. \item Subsamples in general position are found as in \code{FastS} (change~\ref{item:gp} in the previous paragraph). \item If no subsample can be found, \code{FastTau} returns \code{NA} instead of a break using the \code{stop}~function. This allows \code{RobPer} to release a warning, while calculating further periodogram bars for other trial periods. \item A block of code used several times to check new regression parameter candidates for providing the best optimization value so far has been modularized into the subfunction \code{checkbest}. \item Due to rounding errors, it may happen in the IRWLS algorithm that negative values close to zero occur, although they have to be non-negative by theory. This is avoided by setting such values to zero. \item The subfunction \code{randomset} is replaced by the \proglang{R}~function \code{sample} from the \pkg{base}~package as both functions fulfill the same task and \code{sample} is faster. \item The labels of the return values are changed for better interpretation. \end{enumerate} \section[Fit beta distributions with betaCvMfit]{Fit beta distributions with \code{betaCvMfit}}\label{sec:betacvmfit} In this section we present the function \code{betaCvMfit}, which robustly fits a beta distribution to a sample using Cram\'er-von-Mises (CvM) distance minimization. The function is adapted from \proglang{R} code by Brenton R. Clarke for fitting a gamma distribution (see \citealt{clarke2012fast}) using CvM distance minimization. Section~\ref{sec:betamotivation} motivates the application of this function, while its usage is explained in more detail in Section~\ref{sec:betaRfunction}. \subsection{Motivation}\label{sec:betamotivation} After a periodogram is calculated, one might be interested in the automatic detection of significant periods. A period shall be called significant, if the respective periodogram bar is atypical from the distribution of the applied criterion under the null hypothesis of no periodic fluctuation. To determine significance, this distribution needs to be known or estimated. Let $Q_\alpha$ be the $\alpha$-quantile of this distribution. Assuming independent identically distributed periodogram bars $\Per(p_1),\ldots,\Per(p_q)$ we get \begin{align} &P\Big(\max\big(\Per(p_1),\ldots,\Per(p_q)\big)\geq Q_{\sqrt[q]{1-\alpha}}\Big) = \alpha. \label{niveau} \end{align} A single periodogram bar calculated as described in Section~\ref{sec:robper} using unweighted least squares regression is $\mathcal B(\frac{m-1}{2},\frac{n-m}2)$-distributed, where $\mathcal B$ denotes the beta distribution and $m$ is the dimension of the model. This result can be found in~\cite{schwarzenbergczerny98dist} or easily be deduced from~\citet[p.~110]{seber2003} and~\citet[p.~51]{gupta2004}. Already small violations of the assumptions made about the method or the light curve disturb this proceeding. In this work, we consider weighted and robust regression in addition to ordinary least squares. Besides, we have to take into account small deviations from our model assumptions like bad estimates $s_i$. An example is shown in Figure~\ref{fig:standardbetabspl}. Panel~\ref{fig:standardbetabspl_a} shows the weighted least squares periodogram (using a sine model) of a light curve only consisting of white noise. The observed values were generated as \begin{align} y_i&=y_{w;i}+c\cdot y_{r;i}, \qquad i=1,\ldots,n\\ \intertext{with $y_{w;i}$ and $y_{r;i}$ being realizations from} Y_{w;i}&\sim \mathcal N(0,s_i^2),\\ Y_{r;i} &\sim \mathcal N(0,1). \intertext{The value of $s_i$ is given for all $i$, and $c$ is chosen to fulfill} &\frac{\var(c\cdot y_r)}{\var(y_w)+\var(c\cdot y_r)} = 0.2, \label{observedvaluesgenerated} \end{align} where $\var()$ denotes the empirical variance. This means, there is roughly an extra 20 percent noise which is not explained by the measurement accuracies. Evidently, no periodogram bar is outstanding, but using the $\sqrt[q]{0.95}$ quantile of a $\mathcal B(\frac{m-1}{2},\frac{n-m}2)$ distribution (dashed line), several periods are found automatically. <>= set.seed(923) zr <- tsgen(ttype="sine", npoints=500, ytype="const", pf=150, ncycles=71, ps=4, redpart=0.2, alpha=0,SNR=2, interval=FALSE) PP <- RobPer(zr, model="sine", regression="L2", weighting=TRUE, periods=1:100) shapes <- betaCvMfit(PP) myf <- function(x) dbeta(x, shapes[1], shapes[2]) stdf <- function(x) dbeta(x, 1,497/2) @ {\setkeys{Gin}{width=.48\textwidth} \begin{figure}[t!] \subfloat[]{ <>= par(mar=c(4,4,0.1,0.1)) plot(PP, xlab="Trial period", ylab="Periodogram", type="l", axes=FALSE) axis(1) axis(2, at=seq(0,0.06, by=0.02)) abline(h=qbeta(0.95^0.01, 1,497/2), lty=2) box() @ \label{fig:standardbetabspl_a}} \subfloat[]{ <>= par(mar=c(4,4,0.1,0.1)) hist(PP, freq=FALSE, main=" ", xlab="Periodogram",ylab="Density", col="grey80", breaks=15, axes=FALSE) axis(1, at=seq(0,0.06, by=0.02)) axis(2) box() curve(myf, add=TRUE) curve(stdf, add=TRUE, lty=2) @ \label{fig:standardbetabspl_b}} \caption{Example illustrating that a predefined $\mathcal B\left(\frac{m-1}2, \frac{n-m}2\right)$ distribution is sometimes not flexible enough if the model restrictions are slightly violated (see text for details). Panel~\ref{fig:standardbetabspl_a} shows the periodogram of a light curve not completely following the assumed data model with the $\sqrt[q]{0.95}$ quantile of a $\mathcal B(\frac{m-1}{2},\frac{n-m}2)$ distribution (dashed line). Panel~\ref{fig:standardbetabspl_b} shows a histogram of the periodogram bars, with the density of the $\mathcal B(\frac{m-1}{2},\frac{n-m}2)$ (dashed) and the CvM-fitted beta distribution with parameters $0.8<1=\frac{m-1}{2}$ and $40.18<248.5=\frac{n-m}2$ (solid).}\label{fig:standardbetabspl} \end{figure} } To circumvent these problems, \citet{thieler2013periodicity} propose to relax the assumption of a predefined $\mathcal B\left(\frac{m-1}2, \frac{n-m}2\right)$-distribution and only assume that the periodogram values can be approximated by any beta distribution. As peculiar periods are expected to show up as outliers, robustly fitting a $\mathcal B(\theta_1, \theta_2)$-distribution to $\Per(p_1),\ldots,\Per(p_q)$ is proposed. The authors use CvM distance minimization for this, which has been recommended by \citet{clarke2012fast} for fitting gamma distributions in the presence of outliers. The CvM is defined as \begin{align} \int_{0}^\infty \left(F_n(u) - F_\theta(u)\right)^2dF_\theta(u) =\frac 1n \sum_{i=1}^n \left(F_\theta(u_{(i)}) - \frac{i-0.5}{n}\right)^2+ \frac{1}{12n^2}, \label{CvMdefined} \end{align} where $u_{(1)},\ldots,u_{(n)}$ is the ordered sample, $F_n$ is the empirical distribution function and $F_\theta$ is the distribution function of $\mathcal{B}(\theta_1, \theta_2)$. Panel~\ref{fig:standardbetabspl_b} shows the predefined (solid) and the CvM-fitted (dashed) beta density for a periodogram calculated from the only-noise-data described above. While the $\sqrt[q]{0.95}$ quantile of the predefined distribution is about 0.03, the related quantile of the fitted distribution is 0.16 and no period is detected automatically. The above approach falls within the framework of outlier detection described by \cite{davies1993ident} and is successfully used by \citet{thieler2013periodicity} in the context discussed here. However, it assumes independent periodogram bars. This may cause problems when the periodogram peaks are broad (because the assumption of independency of the periodogram bars is violated): Then it can be hard for the automatism to find any outlying periodogram value, as there are many high values. One might try to ease this problem choosing a selection of trial periods with large distances or considering only the periods referring to local maxima in the periodogram as (roughly) independent trial periods (modifying and expanding an approach of \citealt{zechmeister09}) and fit the beta distribution to them using a CvM fit. Simulations indicate that the beta distribution describes the distribution under the null hypothesis rather well for the different periodograms. Nevertheless, in the following we will call detected periods ``valid'' and not ``significant'' to stress that our approach to detect periods lacks a theoretical justification. \subsection[The R function betaCvMfit]{The \proglang{R}~function \code{betaCvMfit}} \label{sec:betaRfunction} The function \code{betaCvMfit} fits a $\mathcal B(\theta_1, \theta_2)$-distribution with mean $\theta_1/(\theta_1+\theta_2)$ to a sample vector \code{data} using CvM distance minimization and has been applied in \citet{thieler2013periodicity} for fitting beta distributions to periodograms to detect valid periods. As it may happen that the periodogram bars become negative due to fitting problems, the function sets all negative entries of \code{data} to zero. If the logical argument \code{CvM} is set to \code{TRUE}, a CvM fit is calculated. As initial values for the optimization, the moment estimates of the beta distribution \begin{align} \widehat \theta_1&=-\frac{\bar x \cdot (-\bar x + \bar x^2 + \hat s^2)}{\hat s^2},& \widehat \theta_2&=\frac{\widehat \theta_1 - \widehat \theta_1 \cdot \bar x}{\bar x}\label{momentestimates} \end{align} are used. If the argument \code{rob} is set to \code{TRUE}, the median and the median absolute deviation from the median (MAD) are used instead of the arithmetic mean for $\bar x$ and the standard deviation for $\hat s$, respectively. In case of a very small estimate $\hat s$ (which happens particularly if $\hat s$ is the MAD), the function stops as it is not possible to calculate the estimates $\widehat \theta_1$ and $\widehat \theta_2$ shown above. The parameters of a beta distribution are strictly positive. Since it can happen that $\widehat\theta_1$ or $\widehat\theta_2$ are negative, the initial estimates are clipped to be at least 0.00001. If \code{CvM} is set to \code{FALSE}, the CvM distance is not optimized, and the initial estimates $\widehat\theta_1$ and $\widehat\theta_2$ are returned. Figure~\ref{fig:betaCvMfit} shows the different fits varying the arguments \code{CvM} and \code{rob} for 50 $\mathcal B(4,15)$-distributed observations containing 10 percent outliers between 0.8 and 1. \begin{figure}[t!] \centering {\setkeys{Gin}{width=.75\textwidth} <>= par(mar=c(3,3,0.3,0.1), mgp=c(2,1,0)) set.seed(12) PP <- c(rbeta(45, shape1=4, shape2=15), runif(5, min=0.8, max=1)) hist(PP, freq=FALSE, breaks=30, ylim=c(0,7), col="grey90", main="", xlab="Periodogram bar") # true parameters: myf.true <- function(x) dbeta(x, shape1=4, shape2=15) curve(myf.true, add=TRUE, lwd=2) # method of moments: par.mom <- betaCvMfit(PP, rob=FALSE, CvM=FALSE) myf.mom <- function(x) dbeta(x, shape1=par.mom[1], shape2=par.mom[2]) curve(myf.mom, add=TRUE, lwd=2, lty=2) # robust method of moments par.rob <- betaCvMfit(PP, rob=TRUE, CvM=FALSE) myf.rob <- function(x) dbeta(x, shape1=par.rob[1], shape2=par.rob[2]) curve(myf.rob, add=TRUE, lwd=2, lty=3) # CvM distance minimization par.CvM <- betaCvMfit(PP, rob=TRUE, CvM=TRUE) myf.CvM <- function(x) dbeta(x, shape1=par.CvM[1], shape2=par.CvM[2]) curve(myf.CvM, add=TRUE, lwd=2, col="grey50") # Searching for outliers... abline(v=qbeta((0.95)^(1/50), shape1=par.CvM[1], shape2=par.CvM[2]), col="grey30") legend("topright", col=c("black", "grey50","black", "black"),lwd=2, lty=c(1,1,3,2), legend=c("True", "CvM", "Robust moments", "Moments")) box() @ } \caption{Gray-scale-version of the example for \code{betaCvMfit} given in the \pkg{RobPer} manual: Histogram of 45 $\mathcal B(4,15)$-distributed observations and 5 outliers uniformly distributed between 0.8 and 1. The black solid line shows the $\mathcal B(4,15)$-distribution, the other curves show different fits using \code{betaCvMfit} (in case of \code{CvM = TRUE}, the different settings for \code{rob} lead to the same result).}\label{fig:betaCvMfit} \end{figure} \section[Generate light curves with tsgen]{Generate light curves with \code{tsgen}}\label{sec:tsgen} To investigate our periodogram methods in simulations, we implemented the \proglang{R}~function \code{tsgen} to generate artificial light curves. A preliminary version of this function is used in \citet{thieler2013periodicity}. The light curves $\left(t_i,y_i,s_i\right)_{i=1,\ldots,n}$ are generated as realizations of the model \begin{align} &T_i = T^\star_{(i)},\quad T_1^\star,\ldots, T_n^\star \sim \mathcal D(p_s),\\ &Y_i =\begin{cases} Y_{f;i} + Y_{w;i} + Y_{r;i}, & \text{$Y_i$ ``behaves regularly''}\\ Y_i^\star, & \text{$Y_i$ is an outlier}\end{cases} ,\\ &Y_{f;i} = f\left(\frac{T_i}{p_f}\right),\quad f(\xi) = f(\xi+1) \;\forall \xi \in \mathds R \\ &Y_{w;i} \sim \mathcal N(0,\sigma_i^2), \\ &s_i= \begin{cases} \text{ given estimate for } \sigma_i \text{ independent from } Y_1,\ldots,Y_n, & \text{$s_i$ ``behaves regularly''}\\ s_i^\star, &\text{$s_i$ is an outlier} \end{cases} , \label{lightcurvesrealisationsmodel} \end{align} where $T^\star_{(i)}$ denotes the $i$th ordered observation time in $T_1^\star,\ldots, T_n^\star$ and $\mathcal D(p_s)$ is a periodic sampling density with period $p_s$. The noise component $Y_r$ is a power law noise (see~\citealt{timmer95}) with power exponent $\alpha$ and is white noise in case of $\alpha=0$. Inserting another noise component and two types of outliers, this extended model allows to generate data violating the model introduced in Section~\ref{sec:intro}. The function calls several autonomous subfunctions one by one which perform individual simulation steps. These are: \begin{enumerate} \item Generate a sampling $t_1, \ldots, t_n$ (using \code{sampler}, see Section~\ref{sec:sampler}). \item Generate a periodic signal $y_{f;1},\ldots,y_{f;n}$ (using \code{signalgen}, see Section~\ref{sec:signalgen}). \item Add noise $y_{w;1},\ldots,y_{w;n}$ with related measurement accuracies $s_1,\ldots,s_n$ and a noise component $y_{r;1},\ldots,y_{r;n}$ unrelated to the $s_i$ (using \code{lc_noise}, see Section~\ref{sec:lc_noise}). \item Disturb the light curve replacing measurement accuracies $s_i$ by outliers, or replacing observations $y_i=y_{f;i}+y_{w;i}+y_{r_i}$ by aperiodic features (using \code{disturber}, see Section~\ref{sec:disturber}). \end{enumerate} Table~\ref{tab:inputtsgen} lists all arguments for the subfunctions. The gray-shaded arguments are also arguments for \code{tsgen}, which passes them to the respective subfunction. \begin{table}[t!] \begin{tabularx}{\linewidth}{lp{2cm}X} \hline Argument & Subfunction & Comment\\ \hline \colorbox{hellgrau}{\code{ps} $ \in\mathds R_{>0}$ } & \code{sampler}, \code{disturber} & Sampling period $p_s$. Default 1.\\ \colorbox{hellgrau}{\code{ncycles} $ \in \mathds N$} & \code{sampler} & Number $n_s$ of sampling cycles. \\ \colorbox{hellgrau}{\code{npoints} $ \in \mathds N$}& \code{sampler} & Sample size $n$.\\ \colorbox{hellgrau}{\code{ttype} } & \code{sampler} & Distribution $\mathcal D(p_s)$ of the unsorted observation times. Options are: \code{"equi"} (equidistant sampling), \code{"unif"} (uniform sampling), \code{"sine"} (sine-shaped density, see Section~\ref{sec:sampler}) and \code{"trian"} (triangular density, see Section~\ref{sec:sampler}).\\ \code{tt} $ \in\mathds R^n$ & \code{signalgen}, \code{lc}\texttt{\textunderscore}\code{noise}, \code{disturber}& Observation times $t_1,\ldots,t_n$, e.g., return value from \code{sampler}.\\ \colorbox{hellgrau}{\code{pf} $ \in\mathds R_{>0}$} & \code{signalgen}& Fluctuation period $p_f$. Default 1. \\ \colorbox{hellgrau}{\code{ytype} } & \code{signalgen}& Type of periodic fluctuation $f$. Options: \code{"const"} (constant), \code{"sine"} (sine), \code{"trian"} (triangular function) and \code{"peak"} (peak function).\\ \code{sig} $ \in\mathds R^n$ & \code{lc}\texttt{\textunderscore}\code{noise} & Values $y_{f;1}, \ldots, y_{f;n}$ of the periodic fluctuation, e.g., return value from \code{signalgen}.\\ \colorbox{hellgrau}{\code{SNR} $ \in \mathds R_{>0}$ } & \code{lc}\texttt{\textunderscore}\code{noise}& Relation $\var(y_f) / \var(y_w+y_r)$.\\ \colorbox{hellgrau}{\code{redpart} $ \in[0,1]$} & \code{lc}\texttt{\textunderscore}\code{noise}& Fraction $\var(y_r)/(\var(y_w)+\var(y_r))$ of noise not related to measurement accuracies.\\ \colorbox{hellgrau}{\code{alpha} } & \code{lc}\texttt{\textunderscore}\code{noise} & Power law coefficient of the noise component $y_r$. Set to zero for $y_{r;1},\ldots, y_{r;n} \underset{\text{i.i.d.}}\sim \mathcal N(0,\sigma^2)$.\\ \code{y} $ \in\mathds R^n$ &\code{disturber} & Observed values $y_{1}, \ldots, y_{n}$, e.g., return value from \code{lc}\texttt{\textunderscore}\code{noise}.\\ \code{s} $ \in \mathds R_{>0}^n$&\code{disturber} & Measurement accuracies $s_{1}, \ldots, s_{n}$, e.g., return value from \code{lc}\texttt{\textunderscore}\code{noise}.\\ \colorbox{hellgrau}{\code{s.outlier.fraction} $ \in [0,1]$ } &\code{disturber}& Fraction of measurement accuracies to be replaced by outliers.\\ \colorbox{hellgrau}{\code{interval} $\in$ \{\code{TRUE}, \code{FALSE}\} } &\code{disturber}& If \code{TRUE}, the $y_i$ belonging to a random time interval are disturbed.\\ \hline \end{tabularx} \caption{Arguments for the subfunctions of \code{tsgen}. See the respective section for more details. Gray-shaded values are also arguments for \code{tsgen}, which passes the values to the respective subfunction. ``$\var$'' denotes the empirical variance.}\label{tab:inputtsgen} \end{table} \subsection[Generate sampling using sampler]{Generate sampling using \code{sampler}}\label{sec:sampler} The \proglang{R}~function \code{sampler} is used to sample observation times $t_1,\ldots,t_n$ in the interval $[0,n_s \cdot p_s]$ with a possibly periodic sampling of period $p_s$. The sampling pattern depends on the argument \code{ttype} (see Table~\ref{tab:inputtsgen}). If a periodic pattern is chosen, the observed time interval covers $n_s$ cycles of it. In case of \code{ttype = "equi"}, the observation times are equidistantly sampled with $t_i=i\frac{p_s\cdot n_s}{n}$. For \code{ttype = "unif"}, the observation times are drawn independently from a uniform distribution on $[0,n_s\cdot p_s]$. Both these sampling schemes are aperiodic, the sampling period $p_s$ only influences the duration $t_n-t_1$ of the sampling. For \code{ttype = "sine"} and \code{ttype = "trian"}, the observation times are sampled from a periodic density with sampling period $p_s$. First, observation cycles $z^\star_i$ are drawn from a discrete uniform distribution on $\{1,\ldots,n_s\}$ to determine the cycle the $i$th observation is part of. Second, observation phases $\varphi^\star_i$ are sampled with density \begin{align} d_{sine}(x)&= \sin(2\pi x)+1 &\text{(for \code{ttype = "sine"})}\\ \text{or } d_{trian}(x)&= \begin{cases} 3x, & 0\leq x\leq\frac{2}{3},\\ 6-6x, &\frac{2}{3}>= library("RobPer") set.seed(22) lightcurve <- tsgen(ttype = "sine", ytype = "peak", pf = 7, redpart = 0.1, s.outlier.fraction = 0.1, interval = TRUE, npoints = 200, ncycles = 25, ps = 20, SNR = 3, alpha = 0) @ % This light curve has a sine-shaped sampling (\code{ttype}) with sampling period 20 (\code{ps}) and covers a time interval of about 25 sampling cycles (\code{ncycles}), so 500 time units. It consists of 200 observations (\code{npoints}) and the observed values contain a peak-shaped periodic fluctuation (\code{ytype}) with fluctuation period 7 (\code{pf}). The measurement accuracies are related to about 90 percent of the noise component (1-\code{redpart}), the rest of the noise is white as well (\code{alpha}). The empirical variance of the periodic fluctuating component in the observed values is three times larger than the empirical variance in the noise component (\code{SNR}). The light curve contains 10 per cent outliers in the measurement accuracies (\code{s.outlier.fraction}) and atypical observed values (\code{interval}). Alternatively, the functions \code{sampler}, \code{signalgen}, \code{lc_noise} and \code{disturber} can be used to generate the same light curve, see Section~\ref{sec:tsgen}. Sampling observation times: % <>= set.seed(22) tt <- sampler(ttype = "sine", npoints = 200, ncycles = 25, ps = 20) @ % Generate periodic fluctuation: % <>= yf <- signalgen(tt, ytype = "peak", pf = 7) @ % Add noise and scale signal to the right \code{SNR}: % <>= temp <- lc_noise(tt, sig = yf, SNR = 3, redpart = 0.1, alpha = 0) y <- temp$y s <- temp$s @ % Replace measurement accuracies by tiny outliers and include a peak: % <>= temp <- disturber(tt, y, s, ps = 20, s.outlier.fraction = 0.1, interval = TRUE) @ % The result is the same: % <>= all(cbind(tt, temp$y, temp$s) == lightcurve) @ % Figure~\ref{fig:ex_art_tsgen} shows plots of the generated light curve. {\setkeys{Gin}{width=.3\textwidth} \begin{figure}[t!] \subfloat[\label{fig:ex_art_zr}]{ <>= par(mar=c(3,3,0.3,0.3), mgp=c(2,1,0)) plot(lightcurve[,1], lightcurve[,2], pch=16, xlab="t", ylab="y", main=" ", ylim=range(c(lightcurve[,2]+lightcurve[,3],lightcurve[,2]-lightcurve[,3])), col=rgb(0,0,0,alpha=0.5), cex=0.7, axes=FALSE) axis(1, at=(0:5)*100, labels=c(0, "", 200, "", 400, "")) axis(2) rect(lightcurve[,1], lightcurve[,2]+lightcurve[,3], lightcurve[,1], lightcurve[,2]-lightcurve[,3], border=rgb(0,0,0,alpha=0.5)) box() @ } \subfloat[\label{fig:ex_art_phase}]{ <>= par(mar=c(3,3,0.3,0.3), mgp=c(2,1,0)) plot(lightcurve[,1]%%7, lightcurve[,2], pch=16, col=rgb(0,0,0,alpha=0.5), xlab="t modulo 7", ylab="y", main=" ", ylim=range(c(lightcurve[,2]+lightcurve[,3],lightcurve[,2]-lightcurve[,3])), cex=0.7) rect(lightcurve[,1]%%7, lightcurve[,2]+lightcurve[,3], lightcurve[,1]%%7, lightcurve[,2]-lightcurve[,3], border=rgb(0,0,0,alpha=0.5)) @ } \subfloat[\label{fig:ex_art_ps}]{ <>= par(mar=c(3,3,0.3,0.3), mgp=c(2,1,0)) hist(lightcurve[,1]%%20, xlab="t modulo 20", col="grey", main=" ", freq=FALSE) dsin <- function(tt) (sin(2*pi*tt/20)+1)/20 curve(dsin, add=TRUE, lwd=2) box() @ } \caption{Artificial light curve in Panel~\ref{fig:ex_art_zr} with vertical bars marking the $s_i$. Plotting time axis modulo 7 in Panel~\ref{fig:ex_art_phase} reveals the periodic fluctuation of $p_f=7$. Histogram and sampling density of the observation times modulo 20 in Panel~\ref{fig:ex_art_ps} shows the sampling periodicity of $p_s=20$.}\label{fig:ex_art_tsgen} \end{figure} } In the next step, we calculate a periodogram of the light curve. The periodogram is calculated fitting a step model using unweighted M-regression with the Huber function. The light curve spans a time interval of approximately \code{ncycles} $\cdot$ \code{ps} $ =500$ time units, so it is sensible to investigate periods up to 50 (one tenth, see \citealt{halpern2003extreme}). % <>= PP <- RobPer(lightcurve, model = "splines", regression = "huber", weighting = FALSE, var1 = FALSE, periods = 1:50) @ % Outstanding periodogram bars are sought fitting a beta distribution to the periodogram values using Cram\'er-von-Mises distance minimization (CvM) and determining the $\sqrt[q]{0.95}$-quantile with $q=50$ as the number of periodogram bars. % <>= betavalues <- betaCvMfit(PP) crit.val <- qbeta((0.95)^(1 / 50), shape1 = betavalues[1], shape2 = betavalues[2]) @ % Panel~\ref{fig:ex_art_hist} depicts the histogram of the periodogram bars, the beta distribution fitted (solid line) and its $\sqrt[50]{0.95}$-quantile (solid vertical line). Further fits of a beta distribution (method of moments, dashed, and robust method of moments, dotted) and their respective $\sqrt[50]{0.95}$-quantiles are shown as well. % <>= hist(PP, breaks = 20, freq = FALSE, xlim = c(0, 0.08), col = "grey", main = "", xlab="Periodogram") betafun <- function(x) dbeta(x, shape1 = betavalues[1], shape2 = betavalues[2]) curve(betafun, add = TRUE, lwd = 2) abline(v = crit.val, lwd = 2) @ % Application of method of moments: % <>= par.mom <- betaCvMfit(PP, rob = FALSE, CvM = FALSE) myf.mom <- function(x) dbeta(x, shape1 = par.mom[1], shape2 = par.mom[2]) curve(myf.mom, add = TRUE, lwd = 2, lty = 2) crit.mom <- qbeta((0.95)^(1 / 50), shape1 = par.mom[1], shape2 = par.mom[2]) abline(v = crit.mom, lwd = 2, lty = 2) @ % Application of robust method of moments: % <>= par.rob <- betaCvMfit(PP, rob = TRUE, CvM = FALSE) myf.rob <- function(x) dbeta(x, shape1 = par.rob[1], shape2 = par.rob[2]) curve(myf.rob, add = TRUE, lwd = 2, lty = 3) crit.rob <- qbeta((0.95)^(1 / 50), shape1 = par.rob[1], shape2 = par.rob[2]) abline(v = crit.rob, lwd = 2, lty = 3) legend("topright", lty = 1:3, legend = c("CvM", "Moments", "Robust moments"), bg = "white", lwd = 2) box() @ % {\setkeys{Gin}{width=.48\textwidth} \begin{figure}[t!] \subfloat[\label{fig:ex_art_hist}]{ <>= par(mar=c(3,3,1,0.7), mgp=c(2,1,0)) hist(PP, breaks = 20, freq = FALSE, xlim = c(0, 0.08), col = "grey", main = "", xlab="Periodogram") curve(betafun, add = TRUE, lwd = 2) abline(v = crit.val, lwd = 2) curve(myf.mom, add = TRUE, lwd = 2, lty = 2) abline(v = crit.mom, lwd = 2, lty = 2) curve(myf.rob, add = TRUE, lwd = 2, lty = 3) abline(v = crit.rob, lwd = 2, lty = 3) legend("topright", lty = 1:3, legend = c("CvM", "Moments", "Robust moments"), bg = "white", lwd = 2) box() @ } \subfloat[\label{fig:ex_art_per}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:50, PP, xlab = "Trial period", ylab = "Periodogram", main = "", type = "l") abline(h = crit.val, lwd = 2) text(7, PP[7]-0.002,7, pos=4) text(14, PP[14]+0.002,14, pos=4) @ } \caption{Periodogram bars calculated fitting a spline model using unweighted M-regression with the Huber function to the artificial example from Figure~\ref{fig:ex_art_tsgen}: Robustly fitting a beta distribution to the periodogram bars in Panel~\ref{fig:ex_art_hist} leads to two outstanding trial periods in Panel~\ref{fig:ex_art_per}.}\label{fig:ex_art} \end{figure} } Using the $\sqrt[50]{0.95}$ quantile of the CvM fit (solid line), a period of 7 time units seems to be valid, see Panel~\ref{fig:ex_art_per}. Twice this period, which is 14, might be valid, too. So the real periodic fluctuation of $p_f =7$ is well recognized within the disturbed signal, as intended. Of course, a periodic function with period $p$ is also periodic with period $k\cdot p$, $k \in \mathds N$. % <>= plot(1:50, PP, xlab = "Trial period", ylab = "Periodogram", main = "", type = "l") abline(h = crit.val, lwd = 2) text(7, PP[7]-0.002,7, pos=4) text(14, PP[14]+0.002,14, pos=4) @ % While the robust M-regression recognizes the real periodic fluctuation, fitting the same model by least squares regression does not, as shown in Figure~\ref{fig:ex_art_nonrob}. Only the periodogram is calculated in another way. % <>= PP <- RobPer(lightcurve, model = "splines", regression = "L2", weighting = FALSE, var1 = FALSE, periods = 1:50) @ % The analysis proceeds as before. {\setkeys{Gin}{width=.48\textwidth} \begin{figure}[t!] \subfloat[\label{fig:ex_art_hist_nonrob}]{ <>= betavalues <- betaCvMfit(PP) crit.val <- qbeta((0.95)^(1 / 50), shape1 = betavalues[1], shape2 = betavalues[2]) par(mar=c(3,3,1,0.7), mgp=c(2,1,0)) hist(PP, breaks = 20, freq = FALSE, ylim = c(0, 50), col = "grey", main = "", xlab="Periodogram") betafun <- function(x) dbeta(x, shape1 = betavalues[1], shape2 = betavalues[2]) curve(betafun, add = TRUE, lwd = 2) abline(v = crit.val, lwd = 2) par.mom <- betaCvMfit(PP, rob = FALSE, CvM = FALSE) myf.mom <- function(x) dbeta(x, shape1 = par.mom[1], shape2 = par.mom[2]) curve(myf.mom, add = TRUE, lwd = 2, lty = 2) crit.mom <- qbeta((0.95)^(1 / 50), shape1 = par.mom[1], shape2 = par.mom[2]) abline(v = crit.mom, lwd = 2, lty = 2) par.rob <- betaCvMfit(PP, rob = TRUE, CvM = FALSE) myf.rob <- function(x) dbeta(x, shape1 = par.rob[1], shape2 = par.rob[2]) curve(myf.rob, add = TRUE, lwd = 2, lty = 3) crit.rob <- qbeta((0.95)^(1 / 50), shape1 = par.rob[1], shape2 = par.rob[2]) abline(v = crit.rob, lwd = 2, lty = 3) legend("topright", lty = 1:3, legend = c("CvM", "Moments", "Robust moments"), bg = "white", lwd = 2) box() @ } \subfloat[\label{fig:ex_art_per_nonrob}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:50, PP, xlab = "Trial period", ylab = "Periodogram", main = "", type = "l") abline(h = crit.val, lwd = 2) @ } \caption{Analysis of the artificial example as in Figure~\ref{fig:ex_art}, now using least squares regression.\label{fig:ex_art_nonrob}} \end{figure} } \subsection{Disturbed data from GROJ0422+32} The first real data set we analyze is a light curve for gamma ray emission of the source GROJ0422+32, obtained by the BATSE Earth Occultation Monitoring project of the NASA. These experiments are described in \citet{Har02} and \citet{Har04}. The data have been kindly provided by the NASA, are available from \url{http://gammaray.nsstc.nasa.gov/batse/occultation}, and are shown in Panel~\ref{fig:groj_zr}. A large peak is visible starting at about 48900 Markarian Julian days (which corresponds to December 10 1991 in the Gregorian calendar), a so called gamma ray burst. It occasionally occurs in gamma ray observations and can be considered as outlier. The light curve covers a time interval of about 3312 days, so following \citet{halpern2003extreme} we consider periods up to 330 days (about one tenth of the overall duration of the light curve). Figure~\ref{fig:groj_L2} shows the periodogram obtained fitting a sine function using least squares regression, which is the classical approach in astroparticle physics. It is calculated using % <>= data(star_groj0422.32) PP <- RobPer(star_groj0422.32, periods = 1:330, model = "sine", regression = "L2", weighting = FALSE) @ % Periodograms for $\tau$-regression and M-regression using the Huber function are obtained replacing \code{"L2"} by \code{"tau"} or \code{"huber"} in the code above. The respective periodograms are shown in Panels~\ref{fig:groj_tau} and~\ref{fig:groj_huber}. All three periodograms do not show any outstanding peak. Apart from this, the periodograms using robust regression have a completely different shape than the least squares periodogram, which seems to have problems with the gamma ray burst. It might be questionable if the least squares periodogram can find a periodic structure in the observations in the presence of the gamma ray burst. We add a sine with period 30 and amplitude 0.005 to the observed values and repeat the analysis. The results can be seen in Figure~\ref{fig:grojfluc}. In Panel~\ref{fig:grojfluc_zr} it is visible that we did not introduce a strong periodic behavior. Nevertheless, the robust periodograms, Panels~\ref{fig:grojfluc_tau} and \ref{fig:grojfluc_huber}, easily detect it, while there is only a small local peak in the least squares periodogram in Panel~\ref{fig:grojfluc_L2}. The horizontal lines in Panels~\ref{fig:grojfluc_tau} and \ref{fig:grojfluc_huber} show the respective $\sqrt[330]{0.95}$-quantiles of the CvM-fitted beta distribution and are calculated from a periodogram \code{PP} using % <>= shapes <- betaCvMfit(PP) Crit <- qbeta(0.95^(1 / 330), shape1 = shapes[1], shape2 = shapes[2]) @ % So, as opposed to least squares regression, robust techniques are able to detect an (added) periodic fluctuation although the data are disturbed seriously by the gamma ray burst. <>= ##################### ## In order to ease the building of this Vignette, some objects needed ## for the following figures (7 and 8) have previously been calculated ## and can be loaded from grojanalysis.Rdata in the inst/extdata ## directory. ## The calculations can be found in inst/extdata/grojanalysis.R. ##################### ## Objects in grojanalysis.Rdata: ### star_groj0422.32: the light curve ### PP_groj: a list with the three periodograms shown in Figure 7 (b-d) ### Crit_groj: the critical values for Figure 7 (horizontal lines in 7(b-d) ### grojfluc: the light curve with a sine added shown in Figur 8 (a) ### PP_grojfluc: a list with the three periodograms shown in Figure 8 (b-d) ### Crit_grojfluc: the critical values for Figure 8 (horizontal lines in 8(b-d) ## (each (b-d): [[1]] LS-, [[2]] tau-, [[3]] Huber-regression) ##################### load(system.file("extdata", "grojanalysis.Rdata", package="RobPer")) @ {\setkeys{Gin}{width=.48\textwidth} \begin{figure}[t!] \subfloat[\label{fig:groj_zr}]{ <>= par(mar=c(3,3,3,1), mgp=c(2,1,0)) xlim_jahr <- as.numeric(substr(as.Date(range(star_groj0422.32[,1]), origin = "1858-11-17", as.character=T),1,4))+c(0,1) xlim <- as.Date(paste(xlim_jahr, "-01-01", sep=""))-as.Date("1858-11-17") plot(star_groj0422.32[,1], star_groj0422.32[,2], pch=16, col=rgb(0,0,0,alpha=0.5),xlab="t (Gregorian date)", ylab="y", xlim=xlim, axes=FALSE, cex=0.7) mtext( "t (Markarian Julian days)", 3, line=2) axis(1, at= as.Date(paste(xlim_jahr[1]:xlim_jahr[2], "-01-01", sep=""))- as.Date("1858-11-17"),labels=xlim_jahr[1]:xlim_jahr[2]) axis(2) axis(3) box() @ } \subfloat[\label{fig:groj_L2}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_groj[[1]], ylim=c(0,1.1*Crit_groj[[1]]), xlab="Trial period", ylab="Least squares periodogram", type="l", cex=0.7) abline(h=Crit_groj[[1]]) @ }\\ \subfloat[\label{fig:groj_tau}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_groj[[2]], ylim=c(0,1.1*Crit_groj[[2]]), xlab="Trial period", ylab="Tau periodogram", type="l", cex=0.7) abline(h=Crit_groj[[2]]) @ } \subfloat[\label{fig:groj_huber}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_groj[[3]],ylim=c(0,1.1*Crit_groj[[3]]), xlab="Trial period", ylab="Huber periodogram", type="l", cex=0.7) abline(h=Crit_groj[[3]]) @ } \caption{Analysis of GROJ0422+32: Panel~\ref{fig:groj_zr} shows the light curve, while the other panels show the periodograms fitting a sine using least squares in Panel~\ref{fig:groj_L2}, $\tau$- in Panel~\ref{fig:groj_tau}, Huber M-regression in Panel~\ref{fig:groj_huber}. No periodogram bar exceeds the respective$\sqrt[330]{0.95}$-quantile of the CvM-fitted beta distribution (horizontal line).}\label{fig:groj} \end{figure} } {\setkeys{Gin}{width=.48\textwidth} \begin{figure}[t!] \subfloat[\label{fig:grojfluc_zr}]{ <>= par(mar=c(3,3,3,1), mgp=c(2,1,0)) xlim_jahr <- as.numeric(substr(as.Date(range(star_groj0422.32[,1]), origin = "1858-11-17", as.character=T),1,4))+c(0,1) xlim <- as.Date(paste(xlim_jahr, "-01-01", sep=""))-as.Date("1858-11-17") plot(grojfluc[,1], grojfluc[,2], xlab="t (Gregorian date)", ylab="y", pch=16, col=rgb(0,0,0,alpha=0.5), xlim=xlim, axes=FALSE, cex=0.7) mtext( "t (Markarian Julian days)", 3, line=2) axis(1, at= as.Date(paste(xlim_jahr[1]:xlim_jahr[2], "-01-01", sep=""))- as.Date("1858-11-17"),labels=xlim_jahr[1]:xlim_jahr[2]) axis(2) axis(3) box() @ } \subfloat[\label{fig:grojfluc_L2}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_grojfluc[[1]], ylim=c(0,Crit_grojfluc[[1]]), xlab="Trial period", ylab="Least squares periodogram", type="l", cex=0.7) abline(h=Crit_grojfluc[[1]]) @ }\\ \subfloat[\label{fig:grojfluc_tau}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_grojfluc[[2]], xlab="Trial period", ylab="Tau periodogram", type="l", cex=0.7) abline(h=Crit_grojfluc[[2]]) @ } \subfloat[\label{fig:grojfluc_huber}]{ <>= par(mar=c(3,3,1,0.1), mgp=c(2,1,0)) plot(1:330, PP_grojfluc[[3]], xlab="Trial period", ylab="Huber periodogram", type="l", cex=0.7) abline(h=Crit_grojfluc[[3]]) @ } \caption{Adding a sine with amplitude 0.005 to the light curve of GROJ0422+32. Panel~\ref{fig:grojfluc_zr} shows the modified light curve, while the other panels show the periodograms fitting a sine using least squares in Panel~\ref{fig:grojfluc_L2}, $\tau$- in Panel~\ref{fig:grojfluc_tau}, Huber M-regression in Panel~\ref{fig:grojfluc_huber}. The horizontal lines in those three panels show the respective $\sqrt[330]{0.95}$-quantile of the CvM-fitted beta distribution.}\label{fig:grojfluc} \end{figure} } \subsection{Data from Markarian 421 and 501} A further real data example are gamma ray light curves from Markarian~421 (Mrk~421) and Markarian~501 (Mrk~501), kindly provided by the Gamma Astronomy group of the Deutsches Elektronen-Synchrotron. The data have been collected from various original sources, combined, and published by \citet{tluczykont2010long}, and are available from \url{http://astro.desy.de/gamma_astronomy/magic/projects/light_curve_archive/index_eng.html}. See the \pkg{RobPer} manual for details about the original sources and references. The light curve obtained for Mrk~421 is shown in Panel~\ref{fig:mrk421_a} on page~\pageref{fig:mrk421}. Periodograms obtained fitting a sine are shown in Figure~\ref{fig:mrk421_pers}. Using the least squares periodogram in Panel ~\ref{fig:mrk421_pers_a}, no valid period is detected, but considering the shape of the periodogram, one might wonder if there is a periodicity of 31 hidden in the same way as when adding a small periodic fluctuation to the GROJ0422+32 data, see Panel~\ref{fig:grojfluc_L2}. However, the periodograms for $\tau$-regression in Panel~\ref{fig:mrk421_pers_b} and Huber M-regression in Panel~\ref{fig:mrk421_pers_c} show a different behavior from Figure~\ref{fig:grojfluc}, so this does not seem to be the case. Especially, the least squares and the Huber M periodogram show a quite similar behavior regarding the local maxima. This could mean that there are not many observations weighted down in Huber M-regression. <>= ##################### ## In order to ease the building of this Vignette, some objects needed ## for the following figures (9 and 10) have previously been calculated ## and can be loaded from MrkAnalysis.Rdata in the inst/extdata ## directory. ## The calculations can be found in inst/extdata/MrkAnalysis.R. ##################### ## Objects in MrkAnalysis.Rdata: ### PPs: list of two lists: #### 1. with the three periodograms of Mrk421 shown in Figure 9 (a-c) #### 2. with the three periodograms of Mrk501 shown in Figure 10 (b-d) ### Crits: list of two lists: #### 1. with the critical values of Mrk421 for Figure 9 (a-c) #### 2. with the critical values of Mrk501 for Figure 10 (b-d) ### perioden: list of two: #### 1. trial periods for Mrk421 #### 2. trial periods for Mrk501 ## (each (a-c) and (b-d): [[1]] LS-, [[2]] tau-, [[3]] Huber-regression) ##################### load(system.file("extdata", "MrkAnalysis.Rdata", package="RobPer")) @ {\setkeys{Gin}{width=.3\textwidth} \begin{figure}[t!] \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[1]], PPs[[1]][[1]], xlab="Trial period", ylab="Least squares periodogram", type="l") abline(h=Crits[[1]][[1]]) @ \label{fig:mrk421_pers_a}} \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[1]], PPs[[1]][[2]], xlab="Trial period", ylab="Tau periodogram", type="l") abline(h=Crits[[1]][[2]]) @ \label{fig:mrk421_pers_b}} \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[1]], PPs[[1]][[3]], xlab="Trial period", ylab="Huber periodogram", type="l") abline(h=Crits[[1]][[3]]) @ \label{fig:mrk421_pers_c}} \caption{Periodograms for Mrk~421, see Panel~\ref{fig:mrk421_a}, obtained fitting a sine with least squares regression in Panel~\ref{fig:mrk421_pers_a}, $\tau$-regression in Panel~\ref{fig:mrk421_pers_b}, Huber M-regression in Panel~\ref{fig:mrk421_pers_c}.}\label{fig:mrk421_pers} \end{figure} } Another light curve, obtained for Mrk~501, and periodograms using least squares regression, $\tau$-regression and Huber M-regression are shown in Figure~\ref{fig:mrk501}. Here we apply step regression, which is equivalent to epoch folding or phase dispersion minimization when using least squares regression (see~Section \ref{sec:robper}). The periodogram is calculated applying % <>= data(Mrk501) RobPer(Mrk501, periods = 1:400, model = "step", regression = "L2", weighting = FALSE) @ % in case of least squares regression and with \code{regression = "tau"} or \code{regression = "huber"} in case of $\tau$- or Huber M-regression, respectively. For least squares regression in Panel~\ref{fig:mrk501_b} and Huber M-regression in Panel~\ref{fig:mrk501_d} we see a broad peak between the trial periods 200 and 300, much too broad to be considered as valid period (see~\citealt{halpern2003extreme}). For $\tau$-regression in Panel~\ref{fig:mrk501_c}, this behavior is not observed. In the examples from the previous section, robust techniques recognize some periodicity in a light curve, while the least squares periodogram only provides a slightly atypical behavior for the trial period in question. Here it is the other way round: the least squares periodogram does not indicate a valid period, but exhibits some interesting feature similar to the previous data set, where a periodicity was hidden in noisy data. This initial suspicion cannot be confirmed by using robust regression instead of least squares regression. In summary, using our methods, we do not find a periodicity in the light curves for Mrk~421 and Mrk~501, neither using least squares nor robust regression. \begin{figure}[t!] \subfloat[]{ {\setkeys{Gin}{width=.9\textwidth} <>= temp <- Mrk501 par(mgp=c(2, 1, 0), mar=c(3,3,3,0.5)) jahre <- seq(1996,2008, by=1) xlim <- c(50100,54400) yats <- seq(0,12, by=3) plot(temp[,1], temp[,2], pch=16, col=rgb(0,0,0,alpha=0.5), axes=F, xlab="t (Gregorian date)", ylab="y", xlim=xlim, ylim=max(temp[,2]+temp[,3])*c(0, 1.1), cex=0.7) rect(temp[,1], temp[,2]+temp[,3], temp[,1], temp[,2]-temp[,3], border=rgb(0,0,0,alpha=0.5)) mtext("t (Markarian Julian days)", 3, line=2) axis(2, at=yats) axis(1, at= as.Date(paste(jahre, "-01-01", sep=""))- as.Date("1858-11-17"),labels=jahre) axis(3) box() @ } \label{fig:mrk501_a}}\\ {\setkeys{Gin}{width=.3\textwidth} \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[2]], PPs[[2]][[1]], xlab="Trial period", ylab="Least squares periodogram", type="l", axes=FALSE) axis(1) axis(2, at=(0:3)/10) box() abline(h=Crits[[2]][[1]]) @ \label{fig:mrk501_b}} \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[2]], PPs[[2]][[2]], xlab="Trial period", ylab="Tau periodogram", type="l") abline(h=Crits[[2]][[2]]) @ \label{fig:mrk501_c}} \subfloat[]{ <>= par(mar=c(3,3,0.6,0.5), mgp=c(2,1,0)) plot(perioden[[2]], PPs[[2]][[3]], xlab="Trial period", ylab="Huber periodogram", type="l") abline(h=Crits[[2]][[3]]) @ \label{fig:mrk501_d}} } \caption{Light curve in Panel~\ref{fig:mrk501_a} and periodograms for Mrk~501 obtained fitting a periodic step function with least squares regression in Panel~\ref{fig:mrk501_b}, $\tau$-regression in Panel~\ref{fig:mrk501_c}, Huber M-regression in Panel~\ref{fig:mrk501_d}.}\label{fig:mrk501} \end{figure} \section{Conclusions}\label{sec:summary} The \proglang{R}~package \pkg{RobPer} presented in this work allows searching for periodicity in irregularly sampled time series, possibly taking into account additional information on the precision of the measurement, if available. These are the typical characteristics of light curves, that is time series occurring in astroparticle physics. The periodogram is calculated fitting periodic functions to the light curve. The user can choose between six different periodic functions and seven different regression techniques, meaning that 42 possible combinations are offered, not taking into account further options like choosing the number of steps for the step model or using weighted regression. The function \code{betaCvMfit} allows to search for prominent periodogram bars as outliers in a beta distribution robustly fitted to the periodogram. The function \code{tsgen} allows generation of artificial light curves for investigative use. \section*{Acknowledgments} The authors are grateful to the reviewers for their constructive comments which led to several improvements of the paper and the programs. The financial support of the Deutsche Forschungsgemeinschaft (DFG) (within the Collaborative Research Center SFB 876 ``Providing Information by Resource-Constrained Data Analysis'', project C3, and Graduate School GK 1032 ``Statistische Modellbildung'') is gratefully acknowledged. We thank the ITMC at TU Dortmund University for providing computer resources on LiDO. We thank the NASA and the Deutsches Elektronen-Synchrotron for providing data. \bibliography{v69i09} \newpage \begin{appendix} \section[Implementation diagrams for RobPer]{Implementation diagrams for \code{RobPer}}\label{sec:app:struktogramme} In this appendix, the structure of the \code{RobPer} function is displayed as Nassi-Shneiderman diagram (structogram after \citeauthor{din66261}). Figure~\ref{app:fig:struktogramm} contains a reading guidance for the blocks used in the structogram. The structogram for \code{RobPer} is displayed in Figure~\ref{app:fig:robper}, for the algorithm \code{singleFUN} in Figure~\ref{app:fig:singlefun} and for the function \code{IRWLS} in Figure~\ref{app:fig:irwls}. The arguments and return values of the latter are shown in Table~\ref{app:tab:irwlsinput}. The following definitions are used: % \begin{align} \zeta_{L_2}(r)&= \sum_{i=1}^n r_i^2\\ \zeta_{LTS}(r)&= \sum_{i=1}^{h(m)} r_{(i)}, & h(m)&= \left\lfloor \frac{n}{2} \right\rfloor + \left\lfloor \frac{m+1}{2} \right\rfloor,\displaybreak[0]\\ \zeta_{L_1}(r)&= \sum_{i=1}^n |r_i|,\displaybreak[0]\\ \rho_{MH}(\nu) &= \begin{cases}\nu^2 & |\nu|\leq k\\ 2k|\nu|-k^2 & |\nu|> k \end{cases}, &\rho_{MB}(\nu)&=\begin{cases} 1-\left(1-\left(\frac{\nu}{k}\right)^2\right)^3 & |\nu|\leq k\\1& |r|> k \end{cases},\displaybreak[0]\\ \zeta_{MH}(r)&= \sum_{i=1}^n \rho_{MH}\left(\frac{r_i}{\widehat \sigma}\right), &\zeta_{MB}(r)&= \sum_{i=1}^n \rho_{MB}\left(\frac{r_i}{\widehat \sigma}\right),\displaybreak[0]\\ W_{MH}(\nu) &= \begin{cases} c_{MH} & |\nu|\leq k\\ c_{MH}\cdot\frac{k}{|\nu|} & |\nu|> k \end{cases}, & W_{MB}(\nu) &=\begin{cases} c_{MB}\cdot \left(1-\left(\frac{\nu}{k}\right)^2\right)^2 & |\nu|\leq k\\0 & |\nu|> k \end{cases}. \label{rhozetadefs} \end{align} % The normalization constant can be set to $c_{MH}=c_{MB}=1$ due to the scale invariance of the least squares estimation used in the iteratively reweighted least squares (IRWLS) step. \clearpage \begin{figure}[t!] \subfloat[\label{app:fig:struktogramm_a}]{\begin{minipage}{\textwidth} \parindent 0pt \renewcommand\tabcolsep{0pt} \footnotesize \begin{tabular}{|C{3cm}|p{1cm}>{}C{8cm}} \cline{1-1} B1 && \\ \cline{1-1} B2&&\\ \cline{1-1} B3&& \multirow{-3}{8cm}{First run B1, afterwards run B2, at last run B3. Horizontal lines between subsequent blocks are sometimes omitted for better readability.}\\ \cline{1-1} \end{tabular} \\[0.3cm] \begin{tabular}{|C{1cm}|C{1cm}|C{1cm}|p{1cm}>{}C{8cm}} \cline{1-3} \multicolumn{3}{|c|}{case} && \\ \multicolumn{2}{|c}{\diagbox[width=2cm, height=0.5cm]}&\diagbox[width=1cm,dir=NE, height=0.5cm]&&\\ 1&2&3&&\\ \cline{1-3} B1 & B2 & B3&& \multirow{-4}{8cm}{If case 1, run B1; if case 2, run B2; if case 3, run B3.}\\ \cline{1-3} \end{tabular} \\[0.3cm] \begin{tabular}{|p{1cm}|C{1cm}|p{1cm}|p{1cm}>{}C{8cm}} \cline{1-3} \multicolumn{3}{|c|}{} && \\ \cline{2-2} &\code{sub} & &&\\ \cline{2-2} \multicolumn{3}{|c|}{}&&\multirow{-3}{8cm}{Run \code{sub} (some algorithm, code or function outsourced).}\\ \cline{1-3} \end{tabular} \\[0.3cm] \begin{tabular}{|p{1cm}|C{2cm}|p{1cm}>{}C{8cm}} \cline{1-2} \multicolumn{2}{|c|}{condition} & &\\ \cline{2-2} &block && \multirow{-2}{8cm}{Reiteration of a block with a check in advance, whether a condition is fulfilled (e.g., a \code{for}-loop)}\\ \cline{1-2} \end{tabular} \\[0.3cm] \begin{tabular}{|p{1cm}|C{2cm}|p{1cm}>{}C{8cm}} \cline{1-2} &block && \\ \cline{2-2} \multicolumn{2}{|c|}{condition} & &\multirow{-2}{8cm}{Reiteration of a block with a check afterwards, whether a condition is fulfilled (e.g., by \code{if(!...)}$\ldots$\code{break})}\\ \cline{1-2} \end{tabular} \\[0.3cm] \end{minipage}} \subfloat[\label{app:fig:struktogramm_b}]{\begin{minipage}{\textwidth} \parindent 0pt \footnotesize \renewcommand\tabcolsep{0pt} \begin{tabular}{|p{0.3cm}C{2.7cm}|C{3cm}|p{1cm}>{}p{8cm}} \cline{1-3} \multicolumn{3}{|c|}{\fbox{\code{runifgen}}\rule{0pt}{0.5cm}} && \code{R> eval(parse(text = runifgen))} \\ \multicolumn{3}{|c|}{$t\leftarrow \tau\cdot 30$} && \code{R> t <- tau * 30} \\ \multicolumn{3}{|c|}{Sort $t$.} && \code{R> t <- sort(t)} \\ \multicolumn{3}{|c|}{Choose \code{wei} randomly from \{\code{TRUE}, \code{FALSE}\}.} && \code{R> wei <- sample(c(TRUE, FALSE), 1)} \\ \cline{1-3} \multicolumn{3}{|c|}{\code{wei}} && \code{R> if (wei) \{} \\ \multicolumn{2}{|c}{\diagbox[width=3cm, height=0.5cm]}&\diagbox[width=3cm,dir=NE, height=0.5cm]&& \code{+ \qquad eval(parse(text = runifgen))} \\ \multicolumn{2}{|c|}{\code{TRUE}} & \code{FALSE} && \code{+ \qquad s <- tau \}} \\ \cline{1-3} \multicolumn{2}{|c|}{\fbox{\code{runifgen}}\rule{0pt}{0.5cm}} & && \code{R> if(!wei) \{}\\ \multicolumn{2}{|c|}{$s\leftarrow \tau$} & $s\leftarrow 0.5\cdot \mathds1_{100}$&& \code{+ \qquad s <- rep(0.5, 100) \}}\\ \cline{1-3} \multicolumn{3}{|c|}{\rule{0pt}{0.5cm}$y\in \mathds R^{100}$}&& \code{R> y <- numeric(100)} \\ \cline{1-3} \multicolumn{3}{|c|}{\rule{0pt}{0.5cm}For $i = 1,\ldots,100$}&& \code{R> for (i in 1:100) \{} \\ \cline{2-3} \rule{0pt}{0.5cm}&\multicolumn{2}{|c|}{Choose $y_i$ from $\mathcal N\left(\sin(2\pi/5t_i), s_i^2\right)$} &&\code{+ \qquad y[i] <- rnorm(1, mean = sin(2 * pi / 5 * \qquad {+ \qquad t[i]), sd = s)} \}} \\ \cline{1-3} \end{tabular} \\[0.3cm] Block \code{runifgen}:\\ \begin{tabular}{|p{0.3cm}|C{5.7cm}|p{1cm}>{}p{8cm}} \multicolumn{2}{c}{} && \code{R> runifgen <- paste("} \\ \cline{1-2} \multicolumn{2}{|c|}{\rule{0pt}{0.5cm}$\tau \in \mathds R^{100}$} & & \code{+ \qquad tau <- numeric(100)} \\ \multicolumn{2}{|c|}{$i\leftarrow 1$} && \code{+ \qquad i <- 1} \\ \cline{1-2} & && \code{+ \qquad repeat \{}\\ &Choose $\tau_i$ from $\mathcal U_{[0,1]}$ && \code{+ \qquad tau[i] <- runif(1)} \\ & $i\leftarrow i+1$ && \code{+ \qquad i <- i+1} \\ \cline{2-2} \multicolumn{2}{|c|}{$i<101$} && \code{+ \qquad if(!i < 101) break \}")}\\ \cline{1-2} \end{tabular} \end{minipage}} \caption{Reading guidance for the structograms: In Panel~\ref{app:fig:struktogramm_a}, the blocks used for the representation of an algorithm. In Panel~\ref{app:fig:struktogramm_b}, a structogram (left) for a simple \proglang{R} code (right), which generates the observations $\left(t_i, y_i, s_i\right)_{i=1,\ldots,100}$ of a simple light curve with fluctuation period 5. This \proglang{R} code is for demonstration only and not programmed efficiently.}\label{app:fig:struktogramm} \end{figure} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \input{chart/RobPer.tex} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \input{chart/singleFun.tex} \clearpage %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{table}[t!] \begin{tabularx}{\linewidth}{llX} \hline Argument & Symbol & Explanation\\ \hline \code{yy} $\in \mathds R^n$ &\code{yy} & {Observed values}\\ \code{matrix}\texttt{\textunderscore} $\in \mathds R^{n\times m}$ &$\mathfrak{X}$ & {Design matrix}\\ \code{W}: $\mathds{R}\rightarrow \mathds R_{\geq0}$ &\code{W} & {Weight function}\\ \code{residuals}\texttt{\textunderscore} $\in \mathds R^n $ & $\mathfrak{e}$ &{Vector of residuals}\\ \code{scale}\texttt{\textunderscore} $\in \mathds R_{>0}$& $\sigma$& {(Estimate of) Standard deviation}\\ \code{tol} $\in \mathds R_{>0}$ &\code{tol}& {Precision for convergence}\\ \hline Return value &&\\ \hline \code{tempIRWLS}\texttt{\$}\code{coeff} & $\widehat b$ & Fitted vector of parameters\\ \hline \end{tabularx} \caption{Arguments and return value of the function \code{IRWLS}.}\label{app:tab:irwlsinput} \end{table} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \input{chart/IRWLS.tex} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \end{appendix} \end{document}