###########
#
# GenMedMSc_CM2.2_practical.R
#
## A practical to accompany lectures CM2.1 and CM2.2 of the 
## Genomic Medicine MSc core module 2
## by Luke Jostins-Dean, Jennie Astley and Alexandra Martin-Geary
## 24th October 2023
#

##Part 0 - setting up packages and downloading files

#There are three example files in the same directory on canvas that you downloaded this R script from 

#You will need to edit this command to where-ever you downloaded the three example files to on your system: # Copy folder path to clipboard from Files
file_prefix <- '~/MSc/'

#you will also need to install three packages from CRAN: pedtools, paramlink2 and genpwr, if you don't already have them installed:
install.packages('pedtools')
install.packages('paramlink2')
install.packages('genpwr')


###Part 1 - making and visualising simple pedigrees

### The package pedtools can make pedigrees. Lets load the package:
library(pedtools)

#the function nuclearPed makes a file representing a simple nuclear family with 5 children 
#(nch specifies the number of children)
nuclear_family_1 <- nuclearPed(nch=5)
nuclear_family_1

#this type of format is often called "pedigree" or "ped" format
#each individual has an id (here numbered 1 to 7), as well as the ID of that individual's father ("fid") 
#and mother ("mid"), where "*" or "0" means that that parent isn't present in the pedigree, and the 
#sex (1 for male, 2 for female). 
#Pedigree format can also include other information that isn't present here, such as disese status.

#in our pedigree everyone other than the mother is male. Lets make another pedigree with a mixture of sexes
#among the children, specified using the argument "sex"
nuclear_family_2 <- nuclearPed(nch=5,sex=c(1,2,1,2,1))
nuclear_family_2
#much more balanced

#the pedtools package comes with a default plotting function for pedigrees. Call it like this:
plot(nuclear_family_2)

#this creates a pedigree plot, or family tree. 
#Squares are male, circles are female, the parents are at the top and all the offspring are at the bottom.

#next, we are going to put some genotypes onto the pedigree. 
#Let's put on genotypes assuming both parents are heterozygous (i.e. A/a)
#and the offspring range from homozygous reference ("A/A"), heterozygous ("A/a") to homozygous alternative ("a/a"): 
nuclear_family_2_genotypes <- c("A/a","A/a","a/a","A/A","A/a","A/a","A/A")

#pedtools has a "marker" format that holds marker data for a pedigree. 
#we can use the "marker" function to make a marker file for our mixed-sex pedigree
nuclear_family_2_markers <- marker(nuclear_family_2,geno=nuclear_family_2_genotypes)

#pedtools can now visualise this if we give it both the pedigree and the marker object
plot(nuclear_family_2,marker= nuclear_family_2_markers)

#you should now see the genotypes on below the family members

#finally, we can set which patients have the disease.
#lets assume that this variant we have made up drives a fully penetrant recessive Mendelian disease. 
#So we make an affection status vector, which is TRUE for individuals who are homozygous a (and thus have the disease) 
#and FALSE (i.e. does does not have the disease) otherwise.
nuclear_family_2_affected <- nuclear_family_2_genotypes == "a/a"

#we can then extract out a list of the IDs of patients who have the disease
#using a vector subset (i.e. square bracket) operation
nuclear_family_2_affected_ids <- nuclear_family_2$ID[nuclear_family_2_affected]
#this should only include on individual, as only one individual was "a/a"

#lets plot the pedigree, with genotypes and affection statuses, together
plot(nuclear_family_2,marker= nuclear_family_2_markers, aff = nuclear_family_2_affected_ids)

##Tasks for part 1
#Task 1.1 (easy) - make another example pedigree with genotypes and affection status, 
#illustrating a autosomal dominant disease

#Task 1.2 (harder) - make another pedigree, this time showing an X-linked recessive disease. 
#This will require a bit of thinking, as males should be coded as "a" or "A", and female "A/A", "A/a" or "a/a". 
#You can tell the marker() function to expect an X chromosome by setting the argument chrom="X".


###Part 2 - Reading in bigger pedigree file

#we have a ped file of cousins, similar to the previous pedigree. Let's read it in and take a look at it;
pedfile <- readPed(paste0(file_prefix,'/Pedigree2.ped'))

#NOTE: If this doesn't work, and you get either an "object 'file_prefix' not found" or 
#a "No such file or directory" error, this probably means you have not set up the 
#folder and downloaded the files in Part 0 at the top of this script.

#take a look at the pedigree we have read in
plot(pedfile)
#this is a much larger pedigree than the ones we looked at above, and has multiple generations

#we also have affection statuses for the individuals
afffile <- read.table(paste0(file_prefix,'/Pedigree2.aff'))

#let's just check that the individuals are in the same order
all(afffile$id == pedfile$ID)

#like we in Part 1, lets make a list of affected individuals from this pedigree
pedfile_affected_ids <- pedfile$ID[afffile$phenotype == TRUE]

#and let's plot the affection status on the pedigree
plot(pedfile,aff=pedfile_affected_ids)

#lots of cases!

#note that in this case, the markers are stored in the ped file itself (rather than a seperate Marker object), 
#as a list of marker objects. 
#This is how we can pull out the geneotypes for the first marker:
pedfile$MARKERS[[1]]

#the pedigree file itself doesn't store details on the markers (their position, their chromosome, etc). 
#That is stored in a map file, which includes the chromosome, the name of the marker, and the position 
#of the marker (in Mb):
mapfile <- read.table(paste0(file_prefix,'/Pedigree2.map'))
head(mapfile)

#so, we can see that the first marker:
mapfile[1,]
#is on chromosome 1, is called "marker1", and is at around 1.43Mb. 

#We can also plot that first marker on the pedigree:
plot(pedfile,marker=pedfile$MARKERS[[1]],aff=pedfile_affected_ids)

#this first variant doesn't seem to line up with affection status.

#Task 2.1 (easy) - can you also make plots for a marker toward the middle of the chromosome region, and one 
#from the end. Could any of these plausibly explain the disease?
#Task 2.2 (easyish-but-a-bit-thinky,might require the use of Google): this linkage map is only for chrosomome 1.
#Does it cover all of chromosome 1? If not, what proportion does it cover? 

###Part 3 - Running a linkage scan

#in this section, we will try out a parametric linkage analysis
#load the paramlink2 library, for doing parametric linkage
library(paramlink2)

#Can any of the other variants explain the disease? Lets do a parametric linkage scan. 
#Remember, parametric means that you set model parameters yourself, so we need to decide 
#on what inheritance model to use.

#What models are compatible with this data? 
#It couldn't be a fully penetrant dominant model, because the disease skips a generation in some cases (e.g. individual 6). 
#It could be a partially penetrant dominant variant, though. Lets try that.

#we set a disease model for paramlink2 using the diseaseModel() function
#we need to give it a model (AD for autosomal dominant, AR for autoosmal recessive, and so on)
#we can set an autosomal dominant model like this:
modAD = diseaseModel(model="AD")

#look at the model details:
modAD
#you can see that function has set some default values.
#One thing it has set is the penetrance - it has set this as the penetrance vector (0,1,1)
#this means that 0% of A/A, 100% of A/a and 100% of a/a individuals will have the disease

#BUT! We know that this doesn't work for our pedigree. So, lets set the penetrance lower, at 80%:
modAD_partialPenetrance = diseaseModel("AD", penetrances=c(0,0.8,0.8))

#we can use the pedigree file, the list of affected individuals and the genetic model to run a linkage scan
#using the function lod():
lodsAD = lod(pedfile, aff = pedfile_affected_ids, model = modAD_partialPenetrance)

#the function is called lod because it generates LOD (log-odds) statistics from the linkage scan

#we can plot the LOD scores for each marker along the chromosome
plot(lodsAD)

#there are no LOD scores greater than 2, so no strong signal there.

#Let's try another model. This pedigree could, in theory, be compatible with a recessive model.

#set the model to autosomal recessive ("AR"), with default penetrances.
modAR = diseaseModel("AR")

#and re-run the linkage scan with this model:
lodsAR = lod(pedfile, aff = pedfile_affected_ids, model = modAR)

#And plot the results
plot(lodsAR)

#Bingo! We have a big hit (LOD > 6).

#Task 3.1 (easyish, think about how to do subsetting in R) - which marker is this hit? Where is it located in the genome? 
#Task 3.2 (a bit harder) - make a plot of this marker on the pedigree, along with affection status - how well does 
#it seem to fit?
#Tast 3.3 (harder, and thinkier) - what is the allele frequency of this variant in the founders of this pedigree? 
#Does this seem suprising? Under what circumstances might you see a pathogenic variant to rise to such a high 
#allele frequency?