# Learning R: Harvard-Smithsonian Center for Astrophysics tutorial
# Eric Feigelson
# January 2014

# Adapted from R scripts of 
# Modern Statistical Methods for Astronomy With R Applications
# Eric D. Feigelson & G. Jogesh Babu
# http://astrostatistics.psu.edu/MSMA

# These scripts may be cut and pasted into any R console.  
# They are self-contained.

# I. Query the session environment and make a vector of three numbers
# A hash mark (#) denotes comments in an R script
# A semi-colon (;) has the same action as a carriage return (<CR>)
# All R functions use () to list arguments and parameters

getwd()   					# find working directory.  
# setwd('/Users/myself/newdir')	# set your personal working directory
setwd('/Users/e5f/Desktop/CASt/Consulting/CfA_2014')
sessionInfo()  				# interact with the host computer system
citation()   				# quote this citation in any publication using R

# II. Create and characterize a vector

a <- c(33, 44, 92, 58)		# combine numbers into a vector
length(a)
ls()  					# list names of objects in your environment
class(a)  					# state the `class' of an R object  
str(a)  					# state the structure of an R object
a 						# state the contents of an R object  
write(file='output', a)  		# write an ASCII file into the working directory
save(file='output_bin', a)  	# write a binary file
Save(file='output_bin', a)  	# error.  R is case sensitive. 

mean(a) ; median(a)
min(a)  ; max(a)
sum(a)  ;  sum(a>40)
summary(a)
which.max(a)
match(44, a)
a[1:4]  ;  a[3]

# Make a data.frame, a 2D array with column names
df.a <- data.frame(cbind(seq(1:4),a))
names(df.a) <- c('rows', 'value')
df.a
help(seq)


# III. Simple math and a plot

5 + 3 ; 5-3 ; 5*3 ; 5/3 ; 5^3
x <- 5 ; y <- 3
x+y 

z <- seq(0.0, 0.5, 0.1)
z
H_0 <- 68  # km/s/Mpc,  Planck value
speed.light <- 3.0E5  
dist <- speed.light*z / H_0
dist
trunc(dist[4]) ; round(dist[4]) 
format(dist[4], digits=3, scientific=T)
sprintf('%7.2e',dist[4])

sin(0)  ; sin(pi/2) 
ang <- seq(0, pi/2, length=30) 
sin(ang)
plot(ang, sin(ang))
plot(ang, log10(sin(ang)), pch=20, cex=1.2, xlab='Angle (radian)', ylab='log(sine)', main='Trig exercise', cex.lab=1.2, cex.main=1.7, col=2)
lines(ang, log10(sin(ang)), lwd=2, lty=2, col='green3')
dev.copy2eps(file='sine.eps')
help(par)

# IV. An astrophysical calculation using functions

Omega_m <- (0.022068 + 0.12029) / (H_0/100)^2
Omega_Lambda <- 0.6825 
E.H0 <- function(redshift) {sqrt(Omega_m*(1+redshift)^3 + Omega_Lambda)}
lum.dist <- function(redshift) {  
	dist = (speed.light/H_0) * integrate(E.H0, 0, redshift)$value
	return(dist) }
lum.dist(0.1)  ; lum.dist(1.0) 



# V. Create a 500x2 bivariate dataset where the X values are evenly 
# distributed between 0 and 3 and the Y values have a nonlinear 
# dependence on X with heteroscedastic Gaussian scatter

help(seq) 		# Learn some R functions.  Use help files frequently!
help(sample)  
help(rnorm)  
help(rbind)
set.seed(1)
x <- sample(seq(0.01, 3, length.out=500))
y <- 0.5*x + 0.3^(x^2) + rnorm(500, mean=0, sd=(0.05*(1+x^2)))


# VI. Examine, summarize and plot univariate distributions

summary(x) ; summary(y)   	# Summarizes properties of an R object
str(x)				# State the structure of an R object
rbind(summary(x),summary(y))	# Concatenate rows.  cbind concatenates columns.

help(par) 				# An essential help file for formatting figures
help(boxplot)			# A common univariate plot
par(mfrow=c(1,2))  		# Set up a two-panel figure
boxplot(x,  notch=T, main='Boxplot for X')
boxplot(y,  notch=T, pch=20, cex=0.5, main='Boxplot for Y')
dev.copy2eps(file='box.eps')	# Copy device contents to an EPS formatted file. Also dev.copy2pdf. 

hist(y, breaks='FD', main='', xlab='Yvar', col='yellow')	# Histogram with Freedman-Diaconis bin widths; see help(nclass.FD)
qqnorm(y, pch=20, cex=0.3)  	# Quantile function of y compared to normal distribution
qqline(y)				# Expected relationship if y is normal

plot(ecdf(x),pch=20,cex=0.3,main='',ylab='EDF',xlab='')
plot(ecdf(y),pch=20,cex=0.3,add=T)
text(2.0,0.5,"X") ; text(1.4,0.8,"Y")
dev.copy2eps(file='edf.eps')


# VII. Put X and Y into a data frame

help(cbind) ; help(as.data.frame) ; help(names)	# Learn more R functions
xy <- cbind(x, y)  ;  str(xy) 		# Here xy is an `array' of numbers
xy <- as.data.frame(xy)
names(xy) <- c('Xvar', 'Yvar')  ; str(xy)	# Now xy is a 'data.frame' with names associated with columns
attach(xy)  					# Attach allows use of variable names rather than column numbers. 
ls()							# List current contents of environment

xy[1:10,]			# Filter the data.frame: first 10 rows
xy[xy[,1]>2,] 		# Rows where the first column value exceeds 2


# VIII. Sampling and bootstrapping 

trials <- sample.int(length(Xvar),10) # 10 random rows
xy[trials,]	

trials <- sample.int(length(Xvar),10, replace=T) # 10 bootstrap resamples
xy[trials,]	

median(Yvar) 			# Estimate the standard error of the median of Yvar?
mad(Yvar) / sqrt(500)		# Median absolute deviation estimate of median s.e.

library(boot)
med <- function(x,ind) median(x[ind])
boot(Yvar, med, R=1000) 	# Bootstrap estimate of median s.e. 
hist(boot(Yvar, med, R=1000)$t, breaks=50, xlab='Bootstrap median of Yvar')


# IX. Bivariate plots and correlation tests

help(plot)
par(mfrow=c(1,2))
plot(xy, pch=20, cex=0.5)	# Scatterplot. See help(points) for symbol shapes. cex specifies symbol size.
plot(log10(xy), pch=20, cex=0.5, xlab='log(Xvar)', ylab='log(Yvar)')	# Note `log' means `log_e' not `log_10' in R. 
par(mfrow=c(1,2))

length(x[x>2])			# State length of a vector.  Use `dim' for an array or data.frame.
cor.test(x[x>2],y[x>2], method='pearson')	# Parametric hypothesis test for bivariate correlation 
cor.test(x[x>2],y[x>2], method='kendall')	# Nonparametric hypothesis test for bivariate correlation
# Note that parametric test is vulnerable to arbitrary variable transformations 
# while nonparametric test is not
cor.test(log10(x[x>2]),log10(y[x>2]), method='pearson')	
cor.test(log10(x[x>2]),log10(y[x>2]), method='kendall')