"use strict";(self.webpackChunkwhg_training_resources=self.webpackChunkwhg_training_resources||[]).push([[492],{3905:(N,e,n)=>{n.d(e,{Zo:()=>c,kt:()=>m});var t=n(7294);function a(N,e,n){return e in N?Object.defineProperty(N,e,{value:n,enumerable:!0,configurable:!0,writable:!0}):N[e]=n,N}function o(N,e){var n=Object.keys(N);if(Object.getOwnPropertySymbols){var t=Object.getOwnPropertySymbols(N);e&&(t=t.filter((function(e){return Object.getOwnPropertyDescriptor(N,e).enumerable}))),n.push.apply(n,t)}return n}function i(N){for(var e=1;e<arguments.length;e++){var n=null!=arguments[e]?arguments[e]:{};e%2?o(Object(n),!0).forEach((function(e){a(N,e,n[e])})):Object.getOwnPropertyDescriptors?Object.defineProperties(N,Object.getOwnPropertyDescriptors(n)):o(Object(n)).forEach((function(e){Object.defineProperty(N,e,Object.getOwnPropertyDescriptor(n,e))}))}return N}function r(N,e){if(null==N)return{};var n,t,a=function(N,e){if(null==N)return{};var n,t,a={},o=Object.keys(N);for(t=0;t<o.length;t++)n=o[t],e.indexOf(n)>=0||(a[n]=N[n]);return a}(N,e);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(N);for(t=0;t<o.length;t++)n=o[t],e.indexOf(n)>=0||Object.prototype.propertyIsEnumerable.call(N,n)&&(a[n]=N[n])}return a}var s=t.createContext({}),l=function(N){var e=t.useContext(s),n=e;return N&&(n="function"==typeof N?N(e):i(i({},e),N)),n},c=function(N){var e=l(N.components);return t.createElement(s.Provider,{value:e},N.children)},p={inlineCode:"code",wrapper:function(N){var e=N.children;return t.createElement(t.Fragment,{},e)}},u=t.forwardRef((function(N,e){var n=N.components,a=N.mdxType,o=N.originalType,s=N.parentName,c=r(N,["components","mdxType","originalType","parentName"]),u=l(n),m=a,d=u["".concat(s,".").concat(m)]||u[m]||p[m]||o;return n?t.createElement(d,i(i({ref:e},c),{},{components:n})):t.createElement(d,i({ref:e},c))}));function m(N,e){var n=arguments,a=e&&e.mdxType;if("string"==typeof N||a){var o=n.length,i=new Array(o);i[0]=u;var r={};for(var s in e)hasOwnProperty.call(e,s)&&(r[s]=e[s]);r.originalType=N,r.mdxType="string"==typeof N?N:a,i[1]=r;for(var l=2;l<o;l++)i[l]=n[l];return t.createElement.apply(null,i)}return t.createElement.apply(null,n)}u.displayName="MDXCreateElement"},2563:(N,e,n)=>{n.r(e),n.d(e,{assets:()=>s,contentTitle:()=>i,default:()=>p,frontMatter:()=>o,metadata:()=>r,toc:()=>l});var t=n(7462),a=(n(7294),n(3905));const o={sidebar_position:2},i="Exploring FASTA files",r={unversionedId:"bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files",id:"bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files",title:"Exploring FASTA files",description:"Introduction",source:"@site/docs/bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files.md",sourceDirName:"bioinformatics/exploring_genomic_data_in_R",slug:"/bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files",permalink:"/bioinformatics/training/MSc_GM_2022/CM4-3-genomic_data/tutorials/bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files",draft:!1,editUrl:"https://github.com/whg-training/whg-training-resources/edit/main/docs/bioinformatics/exploring_genomic_data_in_R/exploring_sequence_files.md",tags:[],version:"current",sidebarPosition:2,frontMatter:{sidebar_position:2},sidebar:"tutorialSidebar",previous:{title:"Getting the prerequiesites",permalink:"/bioinformatics/training/MSc_GM_2022/CM4-3-genomic_data/tutorials/bioinformatics/exploring_genomic_data_in_R/prerequisites"},next:{title:"Working with genome annotation files in R",permalink:"/bioinformatics/training/MSc_GM_2022/CM4-3-genomic_data/tutorials/bioinformatics/exploring_genomic_data_in_R/CM4-3_genomic_data_in_R_gff"}},s={},l=[{value:"Introduction",id:"introduction",level:2}],c={toc:l};function p(N){let{components:e,...o}=N;return(0,a.kt)("wrapper",(0,t.Z)({},c,o,{components:e,mdxType:"MDXLayout"}),(0,a.kt)("h1",{id:"exploring-fasta-files"},"Exploring FASTA files"),(0,a.kt)("h2",{id:"introduction"},"Introduction"),(0,a.kt)("p",null,"This part of the tutorial will look at files containing sequence data: FASTA files. These files are the main format used to hold\ngenome assemblies for major organisms."),(0,a.kt)("p",null,"Two large consortia exist maintaining genomic sequences: Ensembl, maintained by the European bioinformatics institute, and UCSC,\nmaintained by University of California Santa Cruz and funded by NIH, along a few other initiatives, such as ENCODE and RefSeq.\nThese are largely interchangeable but use a differnt indexing strategy and own annotation pipelines. Here we will explore files\nrelating to chromosome 19 sequence and annotation using the files provided by th\n",(0,a.kt)("a",{parentName:"p",href:"https://www.ensembl.org/Homo_sapiens/Info/Index"},"Ensembl consortium"),"."),(0,a.kt)("p",null,"You can download the files by acessing Ensembl website (above), or using the code below which will download them for you."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"curl -O http://ftp.ensembl.org/pub/release-107/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz\n")),(0,a.kt)("p",null,(0,a.kt)("strong",{parentName:"p"},"Note.")," In case of issues a backup of this file is available ",(0,a.kt)("a",{parentName:"p",href:"https://www.well.ox.ac.uk/bioinformatics/training/MSc_GM_2022/CM4-3-genomic_data/data/"},"in this folder"),"."),(0,a.kt)("p",null,"You will have to decompress the files before we start - run this in a terminal window now:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-sh"},"gunzip -k 'Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz'\n")),(0,a.kt)("h1",{id:"genome-sequence-files"},"Genome sequence files"),(0,a.kt)("p",null,"The data is stored as a text file 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'. GRCh38 refers to the name of the assembly - this is\nthe latest published sequence. R Studio is a multilingual environment. Each code chunk you execute can be written in a different\nlanguage. To tell R Studio which language you want to use, swap the default '{r}' at the beginning of the code chunk for '{sh}'\nto run the commands as if you were running them from the command prompt/terminal, or '{python}' to run them in Python. Refer to R\nStudio documentation to find out more. Let's use the bash commands ",(0,a.kt)("inlineCode",{parentName:"p"},"head")," and ",(0,a.kt)("inlineCode",{parentName:"p"},"tail")," to inspect the first and last 10 lines of\nthe FASTA file."),(0,a.kt)("p",null,"Let's look at the top and bottom of the file now:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-sh"},"head -n 10 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'\ntail -n 10 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'\n")),(0,a.kt)("p",null,"This should print something like:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},">19 dna:chromosome chromosome:GRCh38:19:1:58617616:1 REF\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\ntail\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNN\n")),(0,a.kt)("p",null,"We have the header line, starting with the '",">","' character, indicating the sequence name, and the sequence itself should be below.\nBut do you notice something strange? All the bases in the sequence are Ns rather than the expected A, C, G, and T. Why? N\nindicates an ambiguous base, genetic information is protected by the means of long telomeres at either end of the chromosome.\nLet's see how many lines our file contains using the command ",(0,a.kt)("inlineCode",{parentName:"p"},"wc"),"."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-sh"},"wc -l 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'\n")),(0,a.kt)("p",null,"This prints:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"976962 Homo_sapiens.GRCh38.dna.chromosome.19.fa\n")),(0,a.kt)("p",null,"OK, we have about 980,000 lines. Let's select some lines at random and use ",(0,a.kt)("inlineCode",{parentName:"p"},"awk")," to check whether we can see the expected\nsequences consisting of A, G, C, and T."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-sh"},"awk 'FNR>=1000 && FNR<=1020' 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nGATCACAGAGGCTGGGCTGCTCCCCACCCTCTGCACACCTCCTGCTTCTAACAGCAGAGC\nTGCCAGGCCAGGCCCTCAGGCAAGGGCTCTGAAGTCAGGGTCACCTACTTGCCAGGGCCG\nATCTTGGTGCCATCCAGGGGGCCTCTACAAGGATAATCTGACCTGCAGGGTCGAGGAGTT\n(etc).\n")),(0,a.kt)("div",{className:"admonition admonition-tip alert alert--success"},(0,a.kt)("div",{parentName:"div",className:"admonition-heading"},(0,a.kt)("h5",{parentName:"div"},(0,a.kt)("span",{parentName:"h5",className:"admonition-icon"},(0,a.kt)("svg",{parentName:"span",xmlns:"http://www.w3.org/2000/svg",width:"12",height:"16",viewBox:"0 0 12 16"},(0,a.kt)("path",{parentName:"svg",fillRule:"evenodd",d:"M6.5 0C3.48 0 1 2.19 1 5c0 .92.55 2.25 1 3 1.34 2.25 1.78 2.78 2 4v1h5v-1c.22-1.22.66-1.75 2-4 .45-.75 1-2.08 1-3 0-2.81-2.48-5-5.5-5zm3.64 7.48c-.25.44-.47.8-.67 1.11-.86 1.41-1.25 2.06-1.45 3.23-.02.05-.02.11-.02.17H5c0-.06 0-.13-.02-.17-.2-1.17-.59-1.83-1.45-3.23-.2-.31-.42-.67-.67-1.11C2.44 6.78 2 5.65 2 5c0-2.2 2.02-4 4.5-4 1.22 0 2.36.42 3.22 1.19C10.55 2.94 11 3.94 11 5c0 .66-.44 1.78-.86 2.48zM4 14h5c-.23 1.14-1.3 2-2.5 2s-2.27-.86-2.5-2z"}))),"Note")),(0,a.kt)("div",{parentName:"div",className:"admonition-content"},(0,a.kt)("p",{parentName:"div"},"This is an 'advanced use' of the awk command.  We are using it to find lines between 1000 and 1020.\nDon't worry if you are not an awk expert - it's often better to work in R or another programming language for these kinds of tasks."))),(0,a.kt)("p",null,"Aha - we can see some non-missing bases now.  What about further down the file?"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-sh"},"awk 'FNR>=970000 && FNR<=970020' 'Homo_sapiens.GRCh38.dna.chromosome.19.fa'\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"TAGAAGTATTAACTTATTTTGAGGGCTTAAAAAGGCTAGAAGTACTGATGTCCTTTTCCT\nGAGTCCTGAAGTCATTCTAGCCATCAACCTCTGGAGAAATGCTGCTGGGGCCATTTTACC\nATGGGACCAGAAATACAAGTCCCTGACATGGGCTTGGCTGAGAAGAAGCAAGTGGGGTGC\nAAACTATGTGTGCTTTCATGTTGCAAAGAAGCTGTGTTGAATCAACAAATATTACTTGAG\nCACTTGCCAGGATTCCAGGTACTGTTCCAGGGCTGGATCACAGTGATGAGTGGGGCAGGT\n(etc.)\n")),(0,a.kt)("p",null,"Aha! All looks good - most of the file seems to be full of genuine sequence."),(0,a.kt)("p",null,"Let's now load the sequence into R's memory and take a closer look at it. We will skip the header and manipulate the original\nobject so that individual lines are joined together and then split by character. We'll use the ",(0,a.kt)("inlineCode",{parentName:"p"},"scan")," command which just reads\ndata from a file:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"fasta <- scan(\n    'Homo_sapiens.GRCh38.dna.chromosome.19.fa',\n    what = 'character',                              # The type of file to be read\n    skip = 1                                         #Skip the header\n)\n")),(0,a.kt)("p",null,"As shown above the sequence in the file occurs on multiple lines - before starting let's put those back together:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"fasta <- strsplit(fasta, split = '') #Split the long string into characters\nfasta <- unlist(fasta) #Convert the object from a list back to a string\n\nfasta[1:10]\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'[1] "N" "N" "N" "N" "N" "N" "N" "N" "N" "N"\n')),(0,a.kt)("p",null,"Let's see how many of each type of nucleotide is present in the sequence."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"nucleotides <- table(fasta)\n\nnucleotides\n")),(0,a.kt)("p",null,"You should get something like this:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"       A        C        G        N        T \n15142293 13954580 14061132   176858 15282753\n")),(0,a.kt)("p",null,"Finally let's create a barplot to visualise this:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"barplot(nucleotides)\n")),(0,a.kt)("p",null,(0,a.kt)("img",{alt:"img",src:n(126).Z,width:"672",height:"480"})),(0,a.kt)("p",null,"OK cool! "),(0,a.kt)("p",null,"So, there are about 200,000 N nucleotides, which form the minority of all the other bases."),(0,a.kt)("p",null,"It would be interesting to know the telomere lengths of course. To find out, let's try to write a while-loop to measure the\nlength of the first telomere:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"n <- 0                 # Initialise N counter\nwhile( \n    fasta[n+1] == 'N'  # Keep going (n+1 will take on values 1, 2, 3, \u2026) until we see a non-N base\n) {\n  n <- n + 1           # Add 1 to the conut (initially 0)\n}\n\nn\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"[1] 60000\n")),(0,a.kt)("p",null,"It looks like the telomere is 60kb long. "),(0,a.kt)("p",null,"::tip Note"),(0,a.kt)("p",null,"The ",(0,a.kt)("a",{parentName:"p",href:"https://doi.org/10.1128/mcb.10.2.518-527.1990"},"actual telomeres")," are composed of TTAGGG repeats and the number of ambiguous\n'N' bases at chromosome ends is somewhat artificial, but it helps with bioinformatics applications, such as annotation and\nalignment."),(0,a.kt)("p",null,":::"),(0,a.kt)("p",null,"To check we got this right, let's print sequence fragment 2 bases upstream and downstream of the last N to make sure our\ncalculation is correct:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"fasta[ (n-2) : (n+2) ]\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'[1] "N" "N" "N" "G" "A"\n')),(0,a.kt)("p",null,"Bingo!  The base 6000 is the last N character."),(0,a.kt)("p",null,"Let's now take a look at the other, coding bases, and check if AT and GC are in roughtly 50:50 proportion. "),(0,a.kt)("p",null,"To do this, let's take our table object and first convert it to a data frame for easier manipulation.\nWe'll do this now and reformat in a useful way:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"nucleotides <- data.frame(nucleotides, row.names = rownames(nucleotides))\ncolnames(nucleotides) <- c('nucleotide', 'count')                               # Add meaningful column names\nnucleotides$nucleotide <- NULL                                                  # Delete unused column\n#Transpose and retain data frame structure:\nnucleotides <- as.data.frame(t(nucleotides))  \n")),(0,a.kt)("p",null,"Now let's compute GC content:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"GC <- (nucleotides$G + nucleotides$C) / (nucleotides$A + nucleotides$T + nucleotides$G + nucleotides$C)\nAT <- 1 - GC\n\nGC\n")),(0,a.kt)("p",null,"You should get something like this:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"[1] 0.4793865\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"AT\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},"[1] 0.5206135\n")),(0,a.kt)("p",null,"Indeed, the ratio is approximately 50:50. Let's add some code to present our data in a nicer way and make a barlot to show the\nrelative percentages."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"print( paste0('GC %: ', round(GC, digits = 2)))\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'[1] "GC %: 0.48"\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"print( paste0('AT %: ', round(AT, digits = 2)))\n")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'[1] "AT %: 0.52"\n')),(0,a.kt)("p",null,"That's much better!  And let's plot it:"),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-r"},"barplot(c(GC, AT), names.arg = c('GC %', 'AT %'))\n")),(0,a.kt)("p",null,(0,a.kt)("img",{alt:"img",src:n(940).Z,width:"672",height:"480"})),(0,a.kt)("p",null,"The percentage of GC pairs is referred to as GC content and is an important QC diagnostic of sequencing runs. GC content differs\nbetween parts of genome as well as between the organisms. Regions with epigenetic control exhibit presence of ",(0,a.kt)("a",{parentName:"p",href:"https://doi.org/10.1101%2Fgad.2037511"},"CpG\nislands"),", GC-rich regions with methylated cytosine bases."))}p.isMDXComponent=!0},126:(N,e,n)=>{n.d(e,{Z:()=>t});const t=n.p+"assets/images/base_counts-e00f5ca7463b564dde0b90f859bc96d5.png"},940:(N,e,n)=>{n.d(e,{Z:()=>t});const t=n.p+"assets/images/gc_content-2de96370eb52b7070c65357f3ec03416.png"}}]);