Difference between revisions of "Team:Mingdao/Collaborations"
Line 1,846: | Line 1,846: | ||
</div> | </div> | ||
<img class="colelem" id="u4072-4" alt="Code data for data mining of fire retardant protein" width="1124" height="43" src="https://static.igem.org/mediawiki/2015/2/2c/Mingdao-u4072-4.png"/><!-- rasterized frame --> | <img class="colelem" id="u4072-4" alt="Code data for data mining of fire retardant protein" width="1124" height="43" src="https://static.igem.org/mediawiki/2015/2/2c/Mingdao-u4072-4.png"/><!-- rasterized frame --> | ||
− | + | <div class="colelem" id="u4076"><textarea readonly="readonly" style="width:1120px; height:614px;">// | |
+ | // main.c | ||
+ | // sequence_read | ||
+ | // | ||
+ | // Created by Daniel Yi-Ru, Liu on 2/13/15. | ||
+ | // Copyright (c) 2015 iGEM Mingdao. All rights reserved. | ||
+ | // | ||
+ | #include <stdio.h> | ||
+ | #include <string.h> | ||
+ | #include <time.h> | ||
+ | |||
+ | int aaAnalization(char aa); | ||
+ | /* | ||
+ | fasta file format: | ||
+ | |||
+ | item Serial Number space Protein Name New Line Mark | ||
+ | example >sp|Q6GZX4|001R_FRG3G Putative transcription factor 001R OS=Frog virus 3 \n | ||
+ | format %s %[^\n] %*c | ||
+ | saves at serialNumber ignore proteinName ignore | ||
+ | |||
+ | item Sequence | ||
+ | example MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS | ||
+ | EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD | ||
+ | AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL | ||
+ | EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD | ||
+ | SFRKIYTDLGWKFTPL | ||
+ | format %[^>] | ||
+ | saves at sequence | ||
+ | */ | ||
/* ---- Working state for the record currently being processed ---- */
char serialNumber[50];          /* first whitespace-delimited token of the header line */
int sequenceLength;             /* residues in the sequence, newline characters excluded */
char proteinName[1000];         /* rest of the header line after the serial number */
char sequence[100000];          /* raw sequence text, embedded newlines included */
int aaCount[27];                /* occurrences per code: [1..25] follow aminoAcid order, [26] counts newlines */
float aaRate;                   /* aaCount[code] / sequenceLength for the code being printed */
int sequenceCount;              /* cursor into "sequence" */
int aaCode;                     /* code returned by aaAnalization() for one character */
int sequenceLengthCheck;        /* sum of the counts, cross-checked against sequenceLength */
int printScan;                  /* cursor over codes when printing / accumulating */
float proteinMass;              /* sum of count * aaMass over codes 1..24 */
float proteinNitrogenRate;      /* nitrogen mass divided by proteinMass */

/* ---- Whole-run bookkeeping ---- */
int completeMount = 0;          /* number of records handled so far */
clock_t start;                  /* clock() at program start */
clock_t end;                    /* clock() after the last record */

/* Display names; entry k belongs to code k + 1. */
const char aminoAcid [][30] = {
    "Alanine",                      /* code  1 */
    "Cysteine",                     /* code  2 */
    "Aspartic acid",                /* code  3 */
    "Glutamic acid",                /* code  4 */
    "Phenylalanine",                /* code  5 */
    "Glycine",                      /* code  6 */
    "Histidine",                    /* code  7 */
    "Isoleucine",                   /* code  8 */
    "Lysine",                       /* code  9 */
    "Leucine",                      /* code 10 */
    "Methionine",                   /* code 11 */
    "Asparagine",                   /* code 12 */
    "Proline",                      /* code 13 */
    "Glutamine",                    /* code 14 */
    "Arginine",                     /* code 15 */
    "Serine",                       /* code 16 */
    "Threonine",                    /* code 17 */
    "Valine",                       /* code 18 */
    "Tryptophan",                   /* code 19 */
    "Tyrosine",                     /* code 20 */
    "Selenocysteine",               /* code 21 */
    "Pyrrolysine",                  /* code 22 */
    "Glutamine or Glutamic acid",   /* code 23 */
    "Aspartic acid or Asparagine",  /* code 24 */
    "Unknow",                       /* code 25 */
    "New line mark"                 /* code 26 */
};

/* Nitrogen atoms per residue for codes 1..24 (entry k belongs to code k + 1). */
const float aaNitrogenContain [] = {
    1.0, 1.0, 1.0, 1.0,     /* codes  1- 4 */
    1.0, 1.0, 3.0, 1.0,     /* codes  5- 8 */
    2.0, 1.0, 1.0, 2.0,     /* codes  9-12 */
    1.0, 2.0, 4.0, 1.0,     /* codes 13-16 */
    1.0, 1.0, 2.0, 1.0,     /* codes 17-20 */
    1.0, 3.0, 1.5, 1.5      /* codes 21-24 */
};

/* Mass per residue for codes 1..24 (entry k belongs to code k + 1). */
const float aaMass [] = {
    89.09404,  121.15404, 133.10384, 147.13074,     /* codes  1- 4 */
    165.19184, 75.06714,  155.15634, 131.17464,     /* codes  5- 8 */
    146.18934, 131.17464, 149.20784, 132.11904,     /* codes  9-12 */
    115.13194, 146.14594, 174.20274, 105.09344,     /* codes 13-16 */
    119.12034, 117.14784, 204.22844, 181.19124,     /* codes 17-20 */
    168.053,   255.31,    146.63834, 132.61144      /* codes 21-24 */
};

/*
 * Letter -> code table, indexed by (letter - 'A').
 * A value of 0 marks a letter that has no code (only J).
 */
const int aaConvert [] = {
    /* A */  1, /* B */ 24, /* C */  2, /* D */  3, /* E */  4, /* F */  5, /* G */  6,
    /* H */  7, /* I */  8, /* J */  0, /* K */  9, /* L */ 10, /* M */ 11, /* N */ 12,
    /* O */ 22, /* P */ 13, /* Q */ 14, /* R */ 15, /* S */ 16, /* T */ 17, /* U */ 21,
    /* V */ 18, /* W */ 19, /* X */ 25, /* Y */ 20, /* Z */ 23
};
+ | |||
+ | void main(){ | ||
+ | |||
+ | printf("Analization Start...\n"); | ||
+ | |||
+ | start = clock(); | ||
+ | |||
+ | FILE *input, *output; | ||
+ | |||
+ | //input = fopen("uniprot_sprot_test.fasta","r"); | ||
+ | input = fopen("uniprot_sprot.fasta","r"); | ||
+ | |||
+ | if(input == NULL){ | ||
+ | printf("Fail To Open File: uniprot_sprot.fasta!"); | ||
+ | return; | ||
+ | } | ||
+ | |||
+ | output = fopen("output_uniport_N_rate.txt","w+"); | ||
+ | |||
+ | if(output == NULL){ | ||
+ | printf("Fail To Open File: output_uniport_N_rate.txt!"); | ||
+ | return; | ||
+ | } | ||
+ | |||
+ | fprintf(output, "Serial Number\t"); | ||
+ | fprintf(output, "Protein Name\t"); | ||
+ | |||
+ | int i; | ||
+ | for(i = 0; i < 26; i++){ | ||
+ | fprintf(output, "%s\t", aminoAcid[i]); | ||
+ | } | ||
+ | |||
+ | for(i = 0; i < 26; i++){ | ||
+ | fprintf(output, "%s rate\t", aminoAcid[i]); | ||
+ | } | ||
+ | |||
+ | fprintf(output, "Protein Mass (Da)\t"); | ||
+ | fprintf(output, "Nitrogen rate"); | ||
+ | |||
+ | fprintf(output, "\n"); | ||
+ | |||
+ | |||
+ | while(!feof(input)){ | ||
+ | |||
+ | completeMount++; | ||
+ | |||
+ | for (i=0; i<99999; i++) { | ||
+ | sequence[i] = '\0' ; | ||
+ | } | ||
+ | |||
+ | |||
+ | fscanf(input, "%s%*c%[^\n]%*c%[^>]", &serialNumber, &proteinName, &sequence); | ||
+ | //http://blog.csdn.net/u011478505/article/details/25399721 | ||
+ | |||
+ | if(serialNumber[0]!='>'){ | ||
+ | printf("ERROR: Overflow! Line %d", completeMount); | ||
+ | return; | ||
+ | } | ||
+ | |||
+ | //printf("%s %s %s\n", serialNumber, proteinName, sequence); | ||
+ | //printf("%s\n", serialNumber); | ||
+ | //printf("%s\n", proteinName); | ||
+ | //printf("%s\n", sequence); | ||
+ | |||
+ | //fprintf(output, "%s %s %s\n", serialNumber, proteinName, sequence); | ||
+ | fprintf(output, "%s\t", serialNumber); | ||
+ | fprintf(output, "%s\t", proteinName); | ||
+ | //fprintf(output, "%s\n", sequence); | ||
+ | |||
+ | |||
+ | for( i = 0; i < 27; i++){ | ||
+ | aaCount[i] = 0; | ||
+ | } | ||
+ | |||
+ | |||
+ | for(sequenceCount = 0; sequenceCount < strlen(sequence); sequenceCount++){ | ||
+ | aaCode = aaAnalization(sequence [sequenceCount]); | ||
+ | |||
+ | if(aaCode == 0){ | ||
+ | printf("ERROR: Amino acid code unknow! Code:%c", sequence [sequenceCount]); | ||
+ | return; | ||
+ | } | ||
+ | |||
+ | aaCount[aaCode]++; | ||
+ | } | ||
+ | |||
+ | sequenceLengthCheck = 0; | ||
+ | |||
+ | |||
+ | for(printScan = 1; printScan < 27; printScan++){ | ||
+ | |||
+ | //printf("%d\t", aaCount[printScan]); | ||
+ | fprintf(output, "%d\t", aaCount[printScan]); | ||
+ | sequenceLengthCheck += aaCount[printScan]; | ||
+ | |||
+ | } | ||
+ | |||
+ | sequenceLength = (int)strlen(sequence) - aaCount[26]; | ||
+ | sequenceLengthCheck -= aaCount[26]; | ||
+ | |||
+ | for(printScan = 1; printScan < 27; printScan++){ | ||
+ | |||
+ | aaRate = (float)aaCount[printScan]/(float)sequenceLength; | ||
+ | //printf("%f\t", aaRate); | ||
+ | fprintf(output, "%f\t", aaRate); | ||
+ | |||
+ | } | ||
+ | |||
+ | proteinMass = 0; | ||
+ | for(printScan = 1; printScan < 25; printScan++){ | ||
+ | |||
+ | proteinMass += aaCount[printScan]*aaMass[printScan-1]; | ||
+ | |||
+ | } | ||
+ | |||
+ | //printf("%f\t", proteinMass); | ||
+ | fprintf(output, "%f\t", proteinMass); | ||
+ | |||
+ | proteinNitrogenRate = 0; | ||
+ | for(printScan = 1; printScan < 25; printScan++){ | ||
+ | |||
+ | proteinNitrogenRate += aaCount[printScan]*aaNitrogenContain[printScan-1]*14; | ||
+ | |||
+ | } | ||
+ | proteinNitrogenRate /= proteinMass; | ||
+ | //printf("%f\t", proteinNitrogenRate); | ||
+ | fprintf(output, "%f\t", proteinNitrogenRate); | ||
+ | |||
+ | if(sequenceLengthCheck !=sequenceLength){ | ||
+ | printf("\nERROR: Sequence length not match! Length:%d LengthCheck:%d", sequenceLength, sequenceLengthCheck); | ||
+ | return; | ||
+ | } | ||
+ | |||
+ | //printf("\n"); | ||
+ | fprintf(output, "\n"); | ||
+ | |||
+ | |||
+ | int completeRate = completeMount * 10000/ 547599; | ||
+ | int oldCompleteRate; | ||
+ | |||
+ | if(completeRate != oldCompleteRate){ | ||
+ | printf("\rProcessing...\t%.2lf%%", completeMount * 100.0/ 547599); | ||
+ | oldCompleteRate = completeRate; | ||
+ | } | ||
+ | |||
+ | |||
+ | } | ||
+ | |||
+ | |||
+ | fclose(input); | ||
+ | fclose(output); | ||
+ | |||
+ | end = clock(); | ||
+ | |||
+ | printf("\n\nComplete in %.2lf seconds.\t(%.2lf proteins/sec)\n",(end-start)/(double)(CLOCKS_PER_SEC), completeMount/((end-start)/(double)(CLOCKS_PER_SEC))); | ||
+ | printf("%d Proteins Read\n", completeMount); | ||
+ | |||
+ | } | ||
+ | |||
+ | int aaAnalization(char aa){ | ||
+ | int i; | ||
+ | |||
+ | if(aa=='\n'){ | ||
+ | return(26); | ||
+ | } | ||
+ | else{ | ||
+ | i=(int)aa-65; | ||
+ | return(aaConvert[i]); | ||
+ | } | ||
+ | |||
+ | } | ||
+ | </textarea></div> | ||
<a class="anchor_item colelem" id="biobrick-parts-validation"></a> | <a class="anchor_item colelem" id="biobrick-parts-validation"></a> | ||
<div class="clearfix colelem" id="u4077"><!-- group --> | <div class="clearfix colelem" id="u4077"><!-- group --> |
Revision as of 16:25, 16 September 2015
*
*
After understanding the protein composition required for fire-retardant effectiveness, we wanted to find a novel protein candidate with fire-retardant properties. We decided to use a protein data-mining approach from bioinformatics for this purpose.
We contacted iGEM team NCTU-Formosa and asked them to collaborate on data mining and programming. Chao-Di Chang, who was an advisor of team NCTU-Formosa in 2014, taught us a lot about bioinformatics and protein databases such as NCBI, UniProtKB and dbPTM. The super advisor, Prof. Wen-Liang Chen, gave us much great advice and shared his experiences from previous iGEM projects and the Jamboree.
This year, the members of NCTU-Formosa helped us find the fire-retardant target in the protein data bank. Yi-Ru Liu of team Mingdao wrote the program code (see here), and NCTU-Formosa gave us many comments on the code. Thanks to NCTU-Formosa, we found the ideal target, with high serine and arginine content in its protein composition, in the UniProt database (see the result).
*
*
We’ve created BioBricks of SR/pSB1C3 (BBa_K1608000), SRPK/pSB1C3 (BBa_K1608001) and cloned the genes to the commercial expression vectors, i.e., SR/pGEX-2T and SRPK/pET-29b.
Credit: iGEM Team Uppsala 2012
In order to test the expression system, we requested some BioBricks carrying chromoprotein genes (amajLime, BBa_K1033915; asPink, BBa_K1033933; eforRed, BBa_K592012; tsPurple, BBa_K1033905) from iGEM Team Uppsala. Their work was done in 2012.
We transferred the parts to pGEX-2T and then transformed E. coli BL21. Colored proteins visible to the naked eye indicate that the protein induction and expression system works. In addition to validating the BioBricks created previously, this can also be used to optimize the conditions of the protein induction system.
The experimental procedure is described briefly as follows:
↓ Culture E. coli BL21 carrying the indicated BioBricks O/N from the stocks kept at -80°C.
↓ Dilute 100X in the fresh media (LB with 100 ug/ml Ampicillin) for 2 hr
↓ When OD600 = 0.6~1.0, add 1 mM IPTG and incubate for 4~5 hr at 37°C
↓ Transfer 1 ml of bacteria and centrifuge for 1.5 min
↓ Observe color expression
The expression of color genes after IPTG induction in the pGEX-2T system. Left to right: amajLime (BBa_K1033915), asPink (BBa_K1033933), eforRed (BBa_K592012), tsPurple (BBa_K1033905), BL21 (pGEX-2T empty vector)