package genetic_code_pak; -- collection of genetic code utilities, including unigene analysis
-- Genetic code table: a set of triples [codon, three-letter amino acid name, one-letter code].
-- Codons are written in the DNA alphabet (T rather than U), upper case.
-- The stop codons TAA, TAG and TGA carry "*" as their three-letter field and "\r" as their one-letter field.
-- NOTE(review): presumably "\r" is used so that a translated string breaks at each stop codon -- confirm in the translation code.
const genetic_code := {["TTT","Phe","F"], ["TTC","Phe","F"], ["TTA","Leu","L"], ["TTG","Leu","L"], ["TCT","Ser","S"],
["TCC","Ser","S"], ["TCA","Ser","S"], ["TCG","Ser","S"], ["TAT","Tyr","Y"], ["TAC","Tyr","Y"], ["TAA","*","\r"],
["TAG","*","\r"], ["TGT","Cys","C"], ["TGC","Cys","C"], ["TGA","*","\r"], ["TGG","Trp","W"], ["CTT","Leu","L"],
["CTC","Leu","L"], ["CTA","Leu","L"], ["CTG","Leu","L"], ["CCT","Pro","P"], ["CCC","Pro","P"], ["CCA","Pro","P"],
["CCG","Pro","P"], ["CAT","His","H"], ["CAC","His","H"], ["CAA","Gln","Q"], ["CAG","Gln","Q"], ["CGT","Arg","R"],
["CGC","Arg","R"], ["CGA","Arg","R"], ["CGG","Arg","R"], ["ATT","Ile","I"], ["ATC","Ile","I"], ["ATA","Ile","I"],
["ATG","Met","M"], ["ACT","Thr","T"], ["ACC","Thr","T"], ["ACA","Thr","T"], ["ACG","Thr","T"], ["AAT","Asn","N"],
["AAC","Asn","N"], ["AAA","Lys","K"], ["AAG","Lys","K"], ["AGT","Ser","S"], ["AGC","Ser","S"], ["AGA","Arg","R"],
["AGG","Arg","R"], ["GTT","Val","V"], ["GTC","Val","V"], ["GTA","Val","V"], ["GTG","Val","V"], ["GCT","Ala","A"],
["GCC","Ala","A"], ["GCA","Ala","A"], ["GCG","Ala","A"], ["GAT","Asp","D"], ["GAC","Asp","D"], ["GAA","Glu","E"],
["GAG","Glu","E"], ["GGT","Gly","G"], ["GGC","Gly","G"], ["GGA","Gly","G"], ["GGG","Gly","G"]};
-- Base-pairing map: each of a/t/c/g, in lower or upper case, maps to its complementary base in the same case.
-- Characters other than these eight have no entry.
const complement := {["a","t"],["t","a"],["c","g"],["g","c"],["A","T"],["T","A"],["C","G"],["G","C"]};
-- Amino-acid side-chain classes (one-letter codes), as encoded in protein_quality below:
-- Oily: AGVLIPFMWC Hydrocarbon or SH, one nitrogen in ring (nonpairing)
-- Polar: NQSTY CONH2 or OH (self-pairing)
-- Pos: RKH NH2 or double N in ring (pairs with complement)
-- Neg: DE COOH (pairs with complement)
const protein_quality := {["A","h"],["G","h"],["V","h"],["L","h"],["I","h"],["P","h"],["F","h"],["M","h"],
["W","h"],["C","h"],["N","n"],["Q","n"],["S","n"],["T","n"],["Y","n"],
["R","p"],["K","p"],["H","p"],["D","a"],["E","a"]};
-- h = hydrocarbon (oily), n = polar neutral, p = positive (basic), a = acid (negative)
-- Only the 20 standard one-letter codes have an entry; no tag for unknown residues is stored in the map.
-- paths to files used and written
-- Each prefix is a colon-separated folder path ending in ":", and the two derived prefixes are built by
-- concatenation onto bioinf_course_prefix (presumably so that a file name can be appended directly -- confirm at the call sites).
var bioinf_course_prefix := "Diana:Diana2003:Pub:FromGiuseppeLATEST:SetlFolder:bioinformatics_course:";
var to_genome_lengths_prefix := bioinf_course_prefix + "unigene:Unigene_analysis_and_psls:genome_lengths:";
var to_raw_unigene_prefix := bioinf_course_prefix + "unigene:";
-- Both of the following start out undefined (OM) and are package-level, so they are shared by all routines of the package.
var gb_active_ix_flat := OM; -- the index flat to the genbank active file
var gb_active_file_handle := OM; -- handle for the genbank active file
-- ***** Legend for the ambiguity codes used in the restriction enzyme data
-- 3.2. Purine (adenine or guanine): R
-- 3.3. Pyrimidine (thymine or cytosine): Y
-- 3.4. Adenine or thymine: W
-- 3.5. Guanine or cytosine: S
-- 3.6. Adenine or cytosine: M
-- 3.7. Guanine or thymine: K
-- 3.8. Adenine or thymine or cytosine: H
-- 3.9. Guanine or cytosine or thymine: B
-- 3.10. Guanine or adenine or cytosine: V
-- 3.11. Guanine or adenine or thymine: D
-- 3.12. Guanine or adenine or thymine or cytosine: N
-- Per-base weights keyed by upper-case one-letter base code. T and U share one value, and N (any base) reuses the value given for A.
-- NOTE(review): the units and the exact chemical form these numbers refer to are not stated in this file -- confirm before relying on absolute values.
const base_weights := {["A",329.2],["T",306.2],["U",306.2],["C",305.2],["G",345.2],["N",329.2]};
-- Per-residue weights keyed by upper-case one-letter amino acid code.
-- The values for the 20 standard amino acids correspond to average residue masses
-- (amino acid minus one water), e.g. G = 57.1, W = 186.2.
-- "A" was previously 129.1, which is the value for glutamate ("E"); the alanine residue mass is 71.1.
-- NOTE(review): "B" (Asx) and "Z" (Glx) are 132.6 and 146.6, which match the averages of the FREE amino acids
-- (Asn/Asp and Gln/Glu); the corresponding residue averages would be 114.6 and 128.6. Left unchanged -- confirm intent.
const peptide_weights := {["A",71.1],["R",156.2],["N",114.1],["D",115.1],
["C",103.1],["Q",128.1],["E",129.1],["G",57.1],
["H",137.1],["I",113.1],["L",113.1],["K",128.2],["M",131.2],["F",147.2],
["P",97.1],["S",87.1],["T",101.1],["W",186.2],["Y",163.2],["V",99.1],["B",132.6],["Z",146.6]};
-- Map from restriction enzyme name to its recognition pattern.
-- In a pattern, "^" marks a cut position, "(N)k" (written "Nk" in a few entries) stands for k arbitrary bases,
-- and the remaining letters are bases or the ambiguity codes listed in the legend above.
-- Every name must occur only once: a repeated name makes the map multi-valued at that name, so a
-- single-valued lookup restriction_enzymes_data(name) would yield OM.
-- Changes relative to the table as originally pasted:
--   * the second of two entries named "BspM_I" (pattern CTGCA^G) is now named "BspMA_I"
--     (NOTE(review): name taken to be BspMAI -- verify against REBASE);
--   * two non-enzyme entries that were table-header residue ("Type_II" and "Enzymes_Sequence",
--     both with the pattern "Restriction") have been removed.
-- NOTE(review): a number of patterns look damaged at the source and are left exactly as found, e.g. "H" or "S"
-- where the published site has A or AA/TA (Afe_I "HGC^GCT", Acc65_I "G^GTSCC"), entries without any "^"
-- (Bmr_I, BseB_I, Msl_I, Sla_I, Tli_I), "BseX_I" using "(8/12)" and "Eco57M_I" using a bare "16".
-- Verify an entry against REBASE before trusting the cuts it produces.
const restriction_enzymes_data := {["Aar_I", "AGGCCTN4^N4"], ["Aas_I", "GACNNNN^NNGTC"], ["Aat_I", "AGG^CCT"], ["Aat_II", "GACGT^C"],
["Aau_I", "T^GTACA"], ["Acc_I", "GT^MKAC"], ["Acc_II", "CG^CG"], ["Acc_III", "T^CCGGA"],
["Acc16_I", "TGC^GCA"], ["Acc36_I", "ACCTGC(N)4^(N)4"], ["Acc65_I", "G^GTSCC"], ["Acc113_I", "HGT^ACT"],
["AccB1_I", "G^GYRCC"], ["AccB7_I", "CCA(N)4^NTGG"], ["AccBS_I", "CCG^CTC"], ["Aci_I", "C^CGC"],
["Acl_I", "AA^CGTT"], ["AclW_I", "GGATC(N)4^"], ["Acs_I", "R^AATTY"], ["Acu_I", "CTGAAG(N)16^"],
["Acv_I", "CAC^GTG"], ["Acy_I", "GR^CGYC"], ["Ade_I", "CACNNN^GTG"], ["Afa_I", "GT^AC"],
["Afe_I", "HGC^GCT"], ["Afl_II", "C^TTSHG"], ["Afl_III", "A^CRYGT"], ["Age_I", "A^CCGGT"],
["Ahd_I", "GACNNN^NNGTC"], ["Ahl_I", "A^CTAGT"], ["Ale_I", "CACNN^NNGTG"], ["Alo_I", "(N)5^(N)7GAAC(N)6TCC(N)12^"],
["Alu_I", "HG^CT"], ["Alw_I", "GGATC(N)4^"], ["Alw21_I", "GWCCW^C"], ["Alw26_I", "GTCTCN^"],
["Alw44_I", "G^TGCAC"], ["AlwN_I", "CHGNNN^CTG"], ["Ama87_I", "C^YCGRG"], ["Aor51H_I", "HGC^GCT"],
["Apa_I", "GGGCC^C"], ["ApaL_I", "G^TGCAC"], ["Apo_I", "R^AATTY"], ["Asc_I", "GG^CGCGCC"],
["Ase_I", "AT^TSAT"], ["AsiA_I", "A^CCGGT"], ["AsiS_I", "GCGAT^CGC"], ["Asp_I", "GACN^NNGTC"],
["Asp700_I", "GAANN^NNTTC"], ["Asp718_I", "G^GTSCC"], ["AspA2_I", "C^CTAGG"], ["AspE_I", "GACNNN^NNGTC"],
["AspH_I", "GWGCW^C"], ["AspLE_I", "GCG^C"], ["AspS9_I", "G^GNCC"], ["Asu_II", "TT^CGAA"],
["AsuC2_I", "CC^SGG"], ["AsuHP_I", "GGTGA(N)8^"], ["AsuNH_I", "G^CTHGC"], ["Ava_I", "C^YCGRG"],
["Ava_II", "G^GWCC"], ["Avi_II", "TGC^GCA"], ["Avr_II", "C^CTHGG"], ["Axy_I", "CC^TNHGG"],
["Bae_I", "^(N)10AC(N)4"], ["Bal_I", "TGG^CCA"], ["BamH_I", "G^GATCC"], ["Ban_I", "G^GYRCC"],
["Ban_II", "GRGCY^C"], ["Ban_III", "AT^CGAT"], ["Bbe_I", "GGCGC^C"], ["BbrP_I", "CAC^GTC"],
["Bbs_I", "GAHGACNN^"], ["Bbu_I", "GCATG^C"], ["Bbv_I", "GCHGC(N)8^"], ["Bbv12_I", "GWGCW^C"],
["BbvC_I", "CC^TCHGC"], ["Bcc_I", "CCATCNNNN^N"], ["BceA_I", "ACGGC(N)12^(N)2"], ["Bcg_I", "^(N)10CGA(N)6TGC(N)12^"],
["BciV_I", "GTSTCC(N)6^"], ["Bcl_I", "T^GATCA"], ["Bcn_I", "CC^SGG"], ["Bcu_I", "A^CTHGT"],
["Bfa_I", "C^THG"], ["Bfi_I", "ACTGGG(N)5^"], ["Bfm_I", "C^TRYHG"], ["Bfr_I", "C^TTAAG"],
["BfrB_I", "ATG^CAT"], ["Bfu_I", "GTATCC(N)6^"], ["BfuA_I", "ACCTGCNNNN^NNNN"], ["BfuC_I", "^GATC"],
["Bgl_I", "GCC(N)4^NGGC"], ["Bgl_II", "A^GATCT"], ["Bln_I", "C^CTHGG"], ["Blp_I", "GC^TNHGC"],
["Bme18_I", "G^GWCC"], ["Bme1390_I", "CC^NGG"], ["Bme1580_I", "GKGCM^C"], ["BmgB_I", "GAC^GTC"],
["Bmr_I", "ACTGGG"], ["Bmt_I", "GCTAG^C"], ["Bmy_I", "GDGCH^C"], ["Box_I", "GACNN^NNGTC"],
["Bpi_I", "GAHGACNN^"], ["Bpl_I", "^(N)8GHG(N)5CTC(N)13^"], ["Bpm_I", "CTGGHG(N)16^"], ["Bpu10_I", "CC^TNHGC"],
["Bpu14_I", "TT^CGAA"], ["Bpu1102_I", "GC^TNHGC"], ["BpuA_I", "GAHGACNN^"], ["BpuE_I", "CTTGAG(N)16^"],
["Bsa_I", "GGTCTCN^"], ["Bsa29_I", "AT^CGAT"], ["BsaA_I", "YAC^GTR"], ["BsaB_I", "GATNN^NNATC"],
["BsaH_I", "GR^CGYC"], ["BsaJ_I", "C^CNNGG"], ["BsaM_I", "GAATGCN^"], ["BsaO_I", "CGRY^CG"],
["BsaW_I", "W^CCGGW"], ["BsaX_I", "(N)3(N)9AC(N)5CTCC(N)10^"], ["Bsc_I", "AT^CGAT"], ["Bsc4_I", "CC(N)5^NNGG"],
["Bse1_I", "ACTGGN^"], ["Bse3D_I", "GCAATGNN^"], ["Bse8_I", "GATNN^NNATC"], ["Bse21_I", "CC^TNHGG"],
["Bse118_I", "R^CCGGY"], ["BseA_I", "T^CCGGA"], ["BseB_I", "CCWGG"], ["BseC_I", "AT^CGAT"],
["BseD_I", "C^CNNGG"], ["BseG_I", "GGATGNN^"], ["BseJ_I", "GATNN^NNATC"], ["BseL_I", "CC(N)5^NNGG"],
["BseM_I", "GCAATGNN^"], ["BseM_II", "CTCHG(N)10^"], ["BseN_I", "ACTGGN^"], ["BseP_I", "G^CGCGC"],
["BseR_I", "GHGGHG(N)10^"], ["BseS_I", "GKGCM^C"], ["BseX_I", "GCHGC(8/12)"], ["BseX3_I", "C^GGCCG"],
["BseY_I", "C^CCAGC"], ["Bsg_I", "GTGCHG(N)16^"], ["Bsh1236_I", "CG^CG"], ["Bsh1285_I", "CGRY^CG"],
["Bsh1365_I", "GATNN^NNATC"], ["BshF_I", "GG^CC"], ["BshN_I", "G^GYRCC"], ["BshT_I", "A^CCGGT"],
["BsiB_I", "GATNN^NNATC"], ["BsiE_I", "CGRY^CG"], ["BsiHKA_I", "GWGCW^C"], ["BsiHKC_I", "C^YCGRG"],
["BsiM_I", "T^CCGGA"], ["BsiS_I", "C^CGG"], ["BsiW_I", "C^GTSCG"], ["BsiY_I", "CC(N)5^NNGG"],
["BsiZ_I", "AT^CGAT"], ["Bsl_I", "CC(N)5^NNGG"], ["BslF_I", "^(N)11GGGAC(N)10^"], ["Bsm_I", "GAATGCN^"],
["BsmA_I", "GTCTCN^"], ["BsmB_I", "CGTCTCN^"], ["BsmF_I", "GGGAC(N)10^"], ["Bso31_I", "GGTCTCN^NNNN"],
["BsoB_I", "C^YCGRG"], ["BsoMA_I", "GTCTCN^(N)4"], ["Bsp13_I", "T^CCGGA"], ["Bsp19_I", "C^CATGG"],
["Bsp68_I", "TCG^CGA"], ["Bsp106_I", "AT^CGAT"], ["Bsp119_I", "TT^CGAA"], ["Bsp120_I", "G^GGCCC"],
["Bsp143_I", "^GATC"], ["Bsp143_II", "RGCGC^Y"], ["Bsp1286_I", "GDGCH^C"], ["Bsp1407_I", "T^GTACA"],
["Bsp1720_I", "GC^TNAGC"], ["BspAN_I", "GG^CC"], ["BspC_I", "CGAT^CG"], ["BspCN_I", "CTCAG(N)9^"],
["BspD_I", "AT^CGAT"], ["BspE_I", "T^CCGGA"], ["BspH_I", "T^CATGA"], ["BspL_I", "GGN^NCC"],
["BspLU11_I", "A^CATGT"], ["BspM_I", "ACCTGC(N)4^"], ["BspMA_I", "CTGCA^G"], ["BspP_I", "GGATC(N)4^"],
["BspT_I", "C^TTAAG"], ["BspT104_I", "TT^CGAA"], ["BspT107_I", "G^GYRCC"], ["BspTN_I", "GGTCTN^NNNN"],
["BspX_I", "AT^CGAT"], ["Bsr_I", "ACTGGN^"], ["BsrB_I", "CCG^CTC"], ["BsrD_I", "GCAATGNN^"],
["BsrF_I", "R^CCGGY"], ["BsrG_I", "T^GTACA"], ["BsrS_I", "ACTGGN^"], ["BssA_I", "R^CCGGY"],
["BssEC_I", "C^CNNGG"], ["BssH_I", "C^TCGAG"], ["BssH_II", "G^CGCGC"], ["BssK_I", "^CCNGG"],
["BssNA_I", "GTS^TSC"], ["BssS_I", "C^ACGHG"], ["BssT1_I", "C^CWWGG"], ["Bst2B_I", "C^ACGHG"],
["Bst2U_I", "CC^WGG"], ["Bst4C_I", "CAN^GT"], ["Bst6_I", "CTCTTCN^NNN"], ["Bst7l_I", "GCHGC(N)8^"],
["Bst98_I", "C^TTSHG"], ["Bst1107_I", "GTS^TSC"], ["BstAC_I", "GR^CGYC"], ["BstAP_I", "GCA(N)4^NTGC"],
["BstAU_I", "T^GTACA"], ["BstB_I", "TT^CGAA"], ["BstBA_I", "YAC^GTR"], ["BstC8_I", "GCN^NGC"],
["BstDE_I", "C^TNHG"], ["BstDS_I", "C^CRYGG"], ["BstE_II", "G^GTNACC"], ["BstEN_I", "CCTNN^NNNAGG"],
["BstEN_II", "^GATC"], ["BstF5_I", "GGATGNN^"], ["BstFN_I", "CG^CG"], ["BstH2_I", "RGCGC^Y"],
["BstHH_I", "GCG^C"], ["BstHP_I", "GTT^AAC"], ["BstKT_I", "GAT^C"], ["BstMA_I", "CTGCA^G"],
["BstMB_I", "^GATC"], ["BstMC_I", "CGRY^CG"], ["BstMW_I", "GCNNNNN^NNGC"], ["BstN_I", "CC^WGG"],
["BstNS_I", "RCATG^Y"], ["BstO_I", "CC^WGG"], ["BstP_I", "G^GTNACC"], ["BstPA_I", "GACNN^NNGTC"],
["BstSC_I", "^CCNGG"], ["BstSF_I", "C^TRYHG"], ["BstSN_I", "TSC^GTS"], ["BstU_I", "CG^CG"],
["BstV1_I", "GCAGC(N)8^(N)4"], ["BstV2_I", "GAAGACNN^NNNN"], ["BstX_I", "CCA(N)5^NTGG"], ["BstX2_I", "R^GATCY"],
["BstY_I", "R^GATCY"], ["BstZ_I", "C^GGCCG"], ["BstZ17_I", "GTS^TSC"], ["Bsu15_I", "AT^CGAT"],
["Bsu36_I", "CC^TNHGG"], ["BsuR_I", "GG^CC"], ["BsuTU_I", "AT^CGAT"], ["Btg_I", "CCR^YGG"],
["Btr_I", "CAC^GTC"], ["Bts_I", "GCHGTGNN^"], ["Bve_I", "ACCTGC(N)4^(N)4"],
["Cac8_I", "GCN^NGC"], ["Cai_I", "CHGNNN^CTG"], ["CciN_I", "GC^GGCCGC"],
["Cel_II", "GC^TNHGC"], ["Cfo_I", "GCG^C"], ["Cfr_I", "Y^GGCCR"], ["Cfr9_I", "C^CCGGG"],
["Cfr10_I", "R^CCGGY"], ["Cfr13_I", "G^GNCC"], ["Cfr42_I", "CCGC^GG"], ["Cla_I", "AT^CGAT"],
["Cpo_I", "CGGWC^CG"], ["Csp_I", "CG^GWCCG"], ["Csp6_I", "G^TSC"], ["Csp45_I", "TT^CGAA"],
["CspA_I", "A^CCGGT"], ["CviA_II", "C^ATG"], ["CviJ_I", "RG^CY"], ["CviR_I", "TG^CA"],
["CviT_I", "RG^CY"], ["Cvn_I", "CC^TNHGG"], ["Dde_I", "C^TNHG"], ["Dpn_I", "GA^TC"],
["Dpn_II", "^GATC"], ["Dra_I", "TTT^AAA"], ["Dra_II", "RG^GNCCY"], ["Dra_III", "CACNNN^GTG"],
["Drd_I", "GAC(N)4^NNGTC"], ["DseD_I", "GAC(N)4^NNGTC"], ["Eae_I", "Y^GGCCR"], ["Eag_I", "C^GGCCG"],
["Eam1104_I", "CTCTTCN^"], ["Eam1105_I", "GACNNN^NNGTC"], ["Ear_I", "CTCTTCN^"], ["Eci_I", "GGCGGA(N)11^"],
["Ecl136_II", "GHG^CTC"], ["EclHK_I", "GACNNN^NNGTC"], ["EclX_I", "C^GGCCG"], ["Eco24_I", "GRGCY^C"],
["Eco31_I", "GGTCTCN^"], ["Eco32_I", "GAT^ATC"], ["Eco47_I", "G^GWCC"], ["Eco47_III", "HGC^GCT"],
["Eco52_I", "C^GGCCG"], ["Eco57_I", "CTGAAG(N)16^"], ["Eco57M_I", "CTGRAG16^"], ["Eco72_I", "CAC^GTG"],
["Eco81_I", "CC^THGG"], ["Eco88_I", "C^YCGRG"], ["Eco91_I", "G^GTNACC"], ["Eco105_I", "TSC^GTS"],
["Eco130_I", "C^CWWGG"], ["Eco147_I", "HGG^CCT"], ["EcoICR_I", "GHG^CTC"], ["EcoN_I", "CCTNN^NNNHGG"],
["EcoO65_I", "G^GTNACC"], ["EcoO109_I", "RG^GNCCY"], ["EcoR_I", "G^AATTC"], ["EcoR_II", "^CCWGG"],
["EcoR_V", "GAT^ATC"], ["EcoT14_I", "C^CWWGG"], ["EcoT22_I", "ATGCA^T"], ["EcoT38_I", "GRGCY^C"],
["Ege_I", "GGC^GCC"], ["Ehe_I", "GGC^GCC"], ["Erh_I", "C^CWWGG"], ["Esp3_I", "CGTCTCN^"],
["Fal_I", "(N)5^(N)8AAG(N)5CTT(N)13^"], ["Fat_I", "^CATG"], ["Fau_I", "CCCGC(N)4^"], ["FauND_I", "CA^TSTG"],
["Fba_I", "T^GATCA"], ["Fbl_I", "GT^MKAC"], ["Fnu4H_I", "GC^NGC"], ["Fok_I", "GGATG(N)9^"],
["FriO_I", "GRGCY^C"], ["Fse_I", "GGCCGG^CC"], ["Fsp_I", "TGC^GCA"], ["Fsp4H_I", "GC^NGC"],
["FspA_I", "RTGC^GCAY"], ["Fun_I", "AGC^GCT"], ["Fun_II", "G^AATTC"], ["Gsu_I", "CTGGHG(N)16^"],
["Hae_II", "RGCGC^Y"], ["Hae_III", "GG^CC"], ["Hap_II", "C^CGG"], ["Hga_I", "GACGC(N)5^"],
["Hha_I", "GCG^C"], ["Hin1_I", "GR^CGYC"], ["Hin4_I", "(N)5^(N)8GAY(N)5VTC(N)13^"], ["Hin6_I", "G^CGC"],
["Hinc_II", "GTY^RAC"], ["Hind_II", "GTY^YAC"], ["Hind_III", "A^HGCTT"], ["Hinf_I", "G^ANTC"],
["HinP1_I", "G^CGC"], ["Hpa_I", "GTT^AAC"], ["Hpa_II", "C^CGG"], ["Hph_I", "GGTGA(N)8^"],
["Hpy8_I", "GTN^NAC"], ["Hpy99_I", "CGWCG^"], ["Hpy188_I", "TCN^GA"], ["Hpy188_III", "TC^NNGA"],
["HpyCH4_III", "ACN^GT"], ["HpyCH4_IV", "A^CGT"], ["HpyCH4_V", "TG^CA"], ["HpyF10_VI", "GC(N)6^NGC"],
["Hsp92_I", "GR^CGYC"], ["Hsp92_II", "CATG^"], ["HspA_I", "G^CGC"], ["Ita_I", "GC^NGC"],
["Kas_I", "G^GCGCC"], ["Kpn_I", "GGTSC^C"], ["Kpn2_I", "T^CCGGA"], ["Ksp_I", "CCGC^GG"],
["Ksp22_I", "T^GATCA"], ["Ksp632_I", "CTCTTCN^"], ["KspA_I", "GTT^AAC"], ["Kzo9_I", "^GATC"],
["Lsp_I", "TT^CGAA"], ["Lwe_I", "GCATC(N)5^(N)4"], ["Mab_I", "A^CCWGGT"], ["Mae_I", "C^TAG"],
["Mae_II", "A^CGT"], ["Mae_III", "^GTNAC"], ["Mam_I", "GATNN^NNATC"], ["Mbi_I", "CCG^CTC"],
["Mbo_I", "^GATC"], ["Mbo_II", "GAHGA(N)8^"], ["Mfe_I", "C^AATTG"], ["Mfl_I", "R^GATCY"],
["Mhl_I", "GDGCH^C"], ["Mls_I", "TGG^CCA"], ["Mlu_I", "A^CGCGT"], ["MluN_I", "TGG^CCA"],
["Mly_I", "GHGTCN5^"], ["Mly113_I", "GG^CGCC"], ["Mme_I", "TCCRAC(N)20^"], ["Mnl_I", "CCTC(N)7^"],
["Mph1103_I", "ATGCA^T"], ["Mro_I", "T^CCGGA"], ["MroN_I", "G^CCGGC"], ["MroX_I", "GAANN^NNTTC"],
["Msc_I", "TGG^CCA"], ["Mse_I", "T^TAA"], ["Msl_I", "CAY(N)4RTG"], ["Msp_I", "C^CGG"],
["Msp17_I", "GR^CGYC"], ["Msp20_I", "TGG^CCA"], ["MspA1_I", "CMG^CKG"], ["MspC_I", "C^TTSHG"],
["MspR9_I", "CC^NGG"], ["Mss_I", "GTTT^AAAC"], ["Mun_I", "C^AATTG"], ["Mva_I", "CC^WGG"],
["Mva1269_I", "GAATGCN^"], ["Mvn_I", "CG^CG"], ["Mwo_I", "GC(N)5^NNGC"], ["Nae_I", "GCC^GGC"],
["Nar_I", "GG^CGCC"], ["Nci_I", "CC^SGG"], ["Nco_I", "C^CATGG"], ["Nde_I", "CA^TSTG"],
["Nde_II", "^GATC"], ["NgoM_IV", "G^CCGGC"], ["Nhe_I", "G^CTHGC"], ["Nla_III", "CATG^"],
["Nla_IV", "GGN^NCC"], ["NmuC_I", "^GTSAC"], ["Not_I", "GC^GGCCGC"], ["Nru_I", "TCG^CGA"],
["NruG_I", "GACNNN^NNGTC"], ["Nsb_I", "TGC^GCA"], ["Nsi_I", "ATGCA^T"], ["Nsp_I", "RCATG^Y"],
["Nsp_III", "C^YCGRG"], ["Nsp_V", "TT^CGAA"], ["Oli_I", "CACNN^NNGTG"], ["Pac_I", "TTSAT^TSA"],
["Pae_I", "GCATG^C"], ["PaeR7_I", "C^TCGHG"], ["Pag_I", "T^CATGA"], ["Pal_I", "GG^CC"],
["Pau_I", "G^CGCGC"], ["Pce_I", "AGG^CCT"], ["Pci_I", "A^CATGT"], ["Pct_I", "GAATGCN^"],
["Pdi_I", "GCC^GGC"], ["Pdm_I", "GAANN^NNTTC"], ["Pfl23_II", "C^GTSCG"], ["PflB_I", "CCANNNN^NTGG"],
["PflF_I", "GACN^NNGTC"], ["PflM_I", "CCA(N)4^NTGG"], ["Pfo_I", "T^CCNGGA"], ["Pho_I", "GG^CC"],
["PinA_I", "A^CCGGT"], ["Ple_I", "GHGTC(N)4^"], ["Ple19_I", "CGAT^CG"], ["PmaC_I", "CAC^GTG"],
["Pme_I", "GTTT^AAAC"], ["Pml_I", "CAC^GTG"], ["Ppi_I", "(N)5^(N)7GAAC(N)5CTC(N)13^"], ["Pps_I", "GHGTC(N)4^"],
["Ppu10_I", "A^TGCAT"], ["PpuM_I", "RG^GWCCY"], ["PpuX_I", "RG^GWCCY"], ["PshA_I", "GACNN^NNGTC"],
["PshB_I", "AT^TSAT"], ["Psi_I", "TTS^TSA"], ["Psp5_II", "RG^GWCCY"], ["Psp6_I", "^CCWGG"],
["Psp124B_I", "GAGCT^C"], ["Psp1406_I", "AA^CGTT"], ["PspA_I", "C^CCGGG"], ["PspE_I", "G^GTNACC"],
["PspG_I", "^CCWGG"], ["PspL_I", "C^GTSCG"], ["PspN4_I", "GGN^NCC"], ["PspOM_I", "G^GGCCC"],
["PspP_I", "G^GNCC"], ["PspPP_I", "RG^GWCCY"], ["PspX_I", "VC^TCGAGB"], ["Psr_I", "(N)5^(N)7GAAC(N)6TAC(N)12^"],
["Pst_I", "CTGCA^G"], ["Psu_I", "R^GATCY"], ["Psy_I", "GACN^NNGTC"], ["Pvu_I", "CGAT^CG"],
["Pvu_II", "CHG^CTG"], ["Rca_I", "T^CATGA"], ["Rsa_I", "GT^AC"], ["Rsr_II", "CG^GWCCG"],
["Rsr2_I", "CG^GWCCG"], ["Sac_I", "GHGCT^C"], ["Sac_II", "CCGC^GG"], ["Sal_I", "G^TCGAC"],
["SanD_I", "GG^GWCCC"], ["Sap_I", "GCTCTTCN^"], ["Sat_I", "GC^NGC"], ["Sau3A_I", "^GATC"],
["Sau96_I", "G^GNCC"], ["Sbf_I", "CCTGCA^GG"], ["Sca_I", "HGT^ACT"], ["Sch_I", "GHGTC(N)5^"],
["ScrF_I", "CC^NGG"], ["Sda_I", "CCTGCA^GG"], ["Sdu_I", "GDGCH^C"], ["SexA_I", "A^CCWGGT"],
["SfaN_I", "GCATC(N)5^"], ["Sfc_I", "C^TRYHG"], ["Sfi_I", "GGCC(N)4^NGGCC"], ["Sfo_I", "GGC^GCC"],
["Sfr274_I", "C^TCGHG"], ["Sfr303_I", "CCGC^GG"], ["Sfu_I", "TT^CGAA"], ["Sgf_I", "GCGAT^CGC"],
["SgrA_I", "CR^CCGGYG"], ["SgrB_I", "CCGC^GG"], ["Sin_I", "G^GWCC"], ["Sla_I", "CTCGAG"],
["Sma_I", "CCC^GGG"], ["Smi_I", "ATTT^AAAT"], ["SmiM_I", "CAYNN^NNRTG"], ["Sml_I", "C^TYRAG"],
["Smu_I", "CCCGCNNNN^NN"], ["SnaB_I", "TSC^GTS"], ["SpaH_I", "GCATG^C"], ["Spe_I", "A^CTHGT"],
["Sph_I", "GCATG^C"], ["Srf_I", "GCCC^GGGC"], ["Sse9_I", "^AATT"], ["Sse8387_I", "CCTGCA^GG"],
["SseB_I", "HGG^CCT"], ["Ssi_I", "CC^GC"], ["Ssp_I", "AAT^ATT"], ["SspB_I", "T^GTSCA"],
["Sst_I", "GHGCT^C"], ["Sst_II", "CCGC^GG"], ["Stu_I", "AGG^CCT"], ["Sty_I", "C^CWWGG"],
["StyD4_I", "^CCNGG"], ["Sun_I", "C^GTSCG"], ["Swa_I", "ATTT^AAAT"], ["Taa_I", "ACN^GT"],
["Tai_I", "ACGT^"], ["Taq_I", "T^CGA"], ["Taq_II", "GACCGA(N)11^"], ["Tas_I", "^AATT"],
["Tat_I", "W^GTACW"], ["Tau_I", "GCSG^C"], ["Tel_I", "GACN^NNGTC"], ["Tfi_I", "G^AWTC"],
["Tha_I", "CG^CG"], ["Tli_I", "CTCGAG"], ["Tru1_I", "T^TAA"], ["Tru9_I", "T^TAA"],
["Tsc_I", "ACGT^"], ["Tse_I", "G^CWGC"], ["Tsp45_I", "^GTSAC"], ["Tsp509_I", "^AATT"],
["TspDT_I", "ATGAA(N)11^"], ["TspE_I", "^AATT"], ["TspGW_I", "ACGGA(N)11^"], ["TspR_I", "NNCASTGNN^"],
["Tth111_I", "GACN^NNGTC"], ["TthHB8_I", "T^CGA"], ["Van91_I", "CCA(N)4^NTGG"], ["Vha464_I", "C^TTAAG"],
["Vne_I", "G^TGCAC"], ["VpaK11B_I", "G^GWCC"], ["Vsp_I", "AT^TAAT"], ["Xag_I", "CCTNN^NNNAGG"],
["Xap_I", "R^AATTY"], ["Xba_I", "T^CTAGA"], ["Xce_I", "RCATG^Y"], ["Xcm_I", "CCA(N)5^(N)4TGG"],
["Xho_I", "C^TCGAG"], ["Xho_II", "R^GATCY"], ["Xma_I", "C^CCGGG"], ["Xma_III", "C^GGCCG"],
["XmaC_I", "C^CCGGG"], ["XmaJ_I", "C^CTAGG"], ["Xmi_I", "GT^MKAC"], ["Xmn_I", "GAANN^NNTTC"],
["Xsp_I", "C^TAG"], ["Zho_I", "AT^CGAT"], ["Zra_I", "GAC^GTC"], ["Zsp2_I", "ATGCA^T"]};
-- Exported procedures of the package. Only the signatures are declared here; the implementations are in the package body.
procedure actual_data(rec_triple); -- read a record using its triple
procedure cds_start_and_translation(stg); -- try translating a sequence in all three frames; return best piece
procedure tom_in_protein(stg); -- find first start codon in DNA translation
procedure align_by_mers(stg1,stg2,mer_size); -- alignment of two strings by the common mer method.
procedure make_random_dna(n); -- make random sequences of bases of given length
procedure histo_dna(tup,name); -- histogramming
procedure prepare_dna(stg); -- remove junk from fasta record
procedure translate_dna(stg); -- translate dna to protein
procedure rev_comp_dna(stg); -- reverse and complement bases in DNA string
procedure reverse_dna(stg); -- reverse DNA string
procedure complementary_dna(stg); -- complement bases in DNA string
procedure peek(file_name,strt,n); -- peek at start of specified file
procedure survey_ncbi_active_list(file_name); -- initial survey of NCBI active list
procedure index_ncbi_active_list(file_name); -- index the NCBI active list, producing and writing an index flat
procedure gb_record(gb_record_name); -- fetch a genbank record by its name
procedure get_by_gi(gi_no); -- get a genbank record by its genbank gi number
procedure data_blocks_in(lines); -- extract the data blocks from a Genbank record
procedure ncbi_hash(stg); -- hash an ncbi GB name
procedure cut_by_enzymes(flat_stg,enz_name_list); -- return the list of pieces into which a string of bases
-- is cut by a specified list of restriction enzymes,
-- applied in the specified order, return triples [enzA,substg,enzB]
procedure group_strings_by_alignment(tup_of_stgs,tags,mer_size); -- find closely matched subgroups in a list of strings
-- *************************** Unigene Analysis Codes ***************************
procedure filter_psl(file_name); -- filter one of Toto's .psl output files
-- procedure build_examine_unigene_indices(); -- build indices of unigene files and then check them (moved to test program at end)
procedure merge_psl_summaries(file_name_list,out_name); -- merge a list of psl summary files
procedure build_unigene_index(list_of_unigene_filenames,saved_index_name); -- prepare and save a unigene file index
procedure tag_and_index_ug_files(file_name_tup); -- prepare a unigene file index also showing the starting position of the cds
-- this routine can prepare a single index for multiple files
-- Note that this routine returns an index object, leaving it
-- to the calling routine (see build_unigene_indices) to save the index object
procedure triples_in_several(input_file); -- look for cross-species triples; for use with remotely related species
procedure exon_length_compare_histogram(input_file1,input_file2); -- histogram of exon lengths
procedure histo_lengths(input_file); -- build histogram of interior exon lengths
procedure triple_occurences(input_file,rix_file); -- look for multiply occurring triples and assemble annotated groups
procedure show_moduli(thist_line); -- transform a .thist line to show exon moduli and start of reading frame
procedure find_genbank_id(rec_start,rec_len,file_handle); -- read a genbank id from the header line
end genetic_code_pak;
package body genetic_code_pak; -- collection of genetic code utilities
use tkw,string_utility_pak,sort_pak,random_pak,rix_flat_pak,get_lines_pak;
-- use rix_flat_pak to get the gene annotations or data
-- Constants and variables private to the package body.
const allowed_pept_block_charsA := "AGVLIPFMWCNQSTVRKHDEY "; -- allowed peptide characters (upper case, plus blank)
const allowed_pept_block_charsB := "agvlipfmwcnqstvrkhdey "; -- the same set in lower case
const allowed_prefix_chars := "0123456789 "; -- digits and blank
-- NOTE(review): presumably field positions within a .psl record (strand, query start, query end) -- confirm where they are used.
const DNA_strand := 9,q_start := 12,q_end := 13;
-- Shared working variables; of these only start_line is given an explicit initial value (OM).
var ihandle,ohandle_1,ohandle_2,ohandle_n,badhandle,start_line := OM,histo_exon_count,sec_count;
var hv_debug,lines,lines_with_trips,ohandle;
var rix_obj; -- index prepared for access to Unigene information
var alignment_count := 0; -- for tracking progress with alignments
procedure get_by_gi(gi_no); -- get a record by its genbank gi number
  -- Builds the NCBI Entrez viewer URL for gi_no and returns whatever http_get returns for it.
  -- gi_no must be a string, since it is concatenated onto the URL text.
  -- The URL is written without a scheme and without "www.": http_get splits the host off at the
  -- first "/" and prepends "www." itself.
  -- The assignment to url used to exist only inside a commented-out print call, which left url
  -- undefined (OM) when it was passed to http_get; it is now a statement of its own.
  url := "ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=" + gi_no;
  -- print("fetching url: ",url);
  return http_get(url);
end get_by_gi;
procedure http_get(url); -- get url using tkw socket
Tk ?:= tkw(); -- open the main TK window if necessary
url_host := break(url,"/");
sock := Tk("socket",["www." + url_host + ":80","text"]);
-- open socket communication with the HTML host
lines := [];
sock(OM) := "GET " + url;
while (resp := sock(OM)) /= "