diff --git a/README.md b/README.md index fdc1ec586217723ac89f0959b7b7aa3940d90331..203dfdbb077be6297576d0df81856b73ceb401ce 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Embedding-based alignment (EBA) This repository contains the implementation of the EBA method as described in: ["Embedding-based alignment: combining protein language models with dynamic programming alignment to detect structural similarities in the twilight-zone"](https://doi.org/10.1093/bioinformatics/btad786). -Notice that the embedding extraction is independent from the EBA method, and any pLM can be used. However, to facilitate the application we provide a module (plm_extractor.py) that allows the extraction of the per-residue embedding representations for the following pLMs: ProstT5, ProtT5 and ESM-b1. +Notice that the embedding extraction is independent from the EBA method, and any pLM can be used. However, to facilitate the application we provide a module (plm_extractor.py) that allows the extraction of the per-residue embedding representations for the following pLMs: ProtT5, ESM-1b and ProstT5. Note: In case of high dimensionality embeddings (such as ESM2), we suggest to run the EBA with the parameter l=0.1 or l=0.01 to avoid precision errors. 
@@ -30,7 +30,7 @@ protT5_ext = plm.load_extractor('ProtT5', 'residue', device=device) seq1 = 'MLIAFEGIDGSGKTTQAKKLYEYLKQKGYFVSLYREPGGTKVGEVLREILLTEELDERTELLLFEASRSKLIEEKIIPDLKRDKVVILDRFVLSTIAYQGYGKGLDVEFIKNLNEFATRGVKPDITLLLDIPVDIALRRLKEKNRFENKEFLEKVRKGFLELAKEEENVVVIDASGEEEEVFKEILRALSGVLRV' seq2 = 'RRGALIVLEGVDRAGKSTQSRKLVEALCAAGHRAELLRFPERSTEIGKLLSSYLQKKSDVEDHSVHLLFSANRWEQVPLIKEKLSQGVTLVVDRYAFSGVAFTGAKENFSLDWCKQPDVGLPKPDLVLFLQLQLADAAKRGAFGHERYENGAFQERALRCFHQLMKDTTLNWKMVDASKSIEAVHEDIRVLSEDAIATATEKPLGELWK' -### extract per-residue embeddings +### extract per-residue embeddings (if you are using ProstT5, add "<AA2fold> " to the sequences) emb1 = protT5_ext.extract(seq1) emb2 = protT5_ext.extract(seq2) print(emb1.shape) @@ -39,7 +39,7 @@ print(emb1.shape) similarity_matrix = sm.compute_similarity_matrix(emb1, emb2) eba_results = methods.compute_eba(similarity_matrix) ### to return the alignment itself use: -#eba_results = eba.EBA(similarity_matrix, extensive_output=True) +#eba_results = methods.compute_eba(similarity_matrix, extensive_output=True) ### show results print('EBA raw: ', eba_results['EBA_raw']) diff --git a/eba_example.py b/eba_example.py index 83d35f5a8a6314d05f8f288fcb65a15f4462eec0..27d0b0a2cdbf408db91bed37c7aa64da1c265c2d 100644 --- a/eba_example.py +++ b/eba_example.py @@ -7,11 +7,11 @@ from eba import plm_extractor as plm device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') protT5_ext = plm.load_extractor('ProtT5', 'residue', device=device) -### sequences example +### sequences example seq1 = 'MLIAFEGIDGSGKTTQAKKLYEYLKQKGYFVSLYREPGGTKVGEVLREILLTEELDERTELLLFEASRSKLIEEKIIPDLKRDKVVILDRFVLSTIAYQGYGKGLDVEFIKNLNEFATRGVKPDITLLLDIPVDIALRRLKEKNRFENKEFLEKVRKGFLELAKEEENVVVIDASGEEEEVFKEILRALSGVLRV' seq2 = 'RRGALIVLEGVDRAGKSTQSRKLVEALCAAGHRAELLRFPERSTEIGKLLSSYLQKKSDVEDHSVHLLFSANRWEQVPLIKEKLSQGVTLVVDRYAFSGVAFTGAKENFSLDWCKQPDVGLPKPDLVLFLQLQLADAAKRGAFGHERYENGAFQERALRCFHQLMKDTTLNWKMVDASKSIEAVHEDIRVLSEDAIATATEKPLGELWK' -### extract per-residue 
embeddings +### extract per-residue embeddings (if you are using ProstT5, prepend "<AA2fold> " to each sequence) emb1 = protT5_ext.extract(seq1) emb2 = protT5_ext.extract(seq2) print(emb1.shape)