From 95e7ed133bb1ccb5c74ebaac225b42e2d70215d1 Mon Sep 17 00:00:00 2001 From: Lorenzo Pantolini <lorenzo.pantolini@unibas.ch> Date: Thu, 8 Aug 2024 14:32:12 +0200 Subject: [PATCH] update README --- README.md | 6 +++--- eba_example.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fdc1ec5..203dfdb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Embedding-based alignment (EBA) This repository contains the implementation of the EBA method as described in: ["Embedding-based alignment: combining protein language models with dynamic programming alignment to detect structural similarities in the twilight-zone"](https://doi.org/10.1093/bioinformatics/btad786). -Notice that the embedding extraction is independent from the EBA method, and any pLM can be used. However, to facilitate the application we provide a module (plm_extractor.py) that allows the extraction of the per-residue embedding representations for the following pLMs: ProstT5, ProtT5 and ESM-b1. +Notice that the embedding extraction is independent from the EBA method, and any pLM can be used. However, to facilitate the application we provide a module (plm_extractor.py) that allows the extraction of the per-residue embedding representations for the following pLMs: ProtT5, ESM-1b and ProstT5. Note: In case of high dimensionality embeddings (such as ESM2), we suggest to run the EBA with the parameter l=0.1 or l=0.01 to avoid precision errors. 
@@ -30,7 +30,7 @@ protT5_ext = plm.load_extractor('ProtT5', 'residue', device=device) seq1 = 'MLIAFEGIDGSGKTTQAKKLYEYLKQKGYFVSLYREPGGTKVGEVLREILLTEELDERTELLLFEASRSKLIEEKIIPDLKRDKVVILDRFVLSTIAYQGYGKGLDVEFIKNLNEFATRGVKPDITLLLDIPVDIALRRLKEKNRFENKEFLEKVRKGFLELAKEEENVVVIDASGEEEEVFKEILRALSGVLRV' seq2 = 'RRGALIVLEGVDRAGKSTQSRKLVEALCAAGHRAELLRFPERSTEIGKLLSSYLQKKSDVEDHSVHLLFSANRWEQVPLIKEKLSQGVTLVVDRYAFSGVAFTGAKENFSLDWCKQPDVGLPKPDLVLFLQLQLADAAKRGAFGHERYENGAFQERALRCFHQLMKDTTLNWKMVDASKSIEAVHEDIRVLSEDAIATATEKPLGELWK' -### extract per-residue embeddings +### extract per-residue embeddings (if you are using ProstT5, add "<AA2fold> " to the sequences) emb1 = protT5_ext.extract(seq1) emb2 = protT5_ext.extract(seq2) print(emb1.shape) @@ -39,7 +39,7 @@ print(emb1.shape) similarity_matrix = sm.compute_similarity_matrix(emb1, emb2) eba_results = methods.compute_eba(similarity_matrix) ### to return the alignment itself use: -#eba_results = eba.EBA(similarity_matrix, extensive_output=True) +#eba_results = methods.compute_eba(similarity_matrix, extensive_output=True) ### show results print('EBA raw: ', eba_results['EBA_raw']) diff --git a/eba_example.py b/eba_example.py index 83d35f5..27d0b0a 100644 --- a/eba_example.py +++ b/eba_example.py @@ -7,11 +7,11 @@ from eba import plm_extractor as plm device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') protT5_ext = plm.load_extractor('ProtT5', 'residue', device=device) -### sequences example +### sequences example seq1 = 'MLIAFEGIDGSGKTTQAKKLYEYLKQKGYFVSLYREPGGTKVGEVLREILLTEELDERTELLLFEASRSKLIEEKIIPDLKRDKVVILDRFVLSTIAYQGYGKGLDVEFIKNLNEFATRGVKPDITLLLDIPVDIALRRLKEKNRFENKEFLEKVRKGFLELAKEEENVVVIDASGEEEEVFKEILRALSGVLRV' seq2 = 'RRGALIVLEGVDRAGKSTQSRKLVEALCAAGHRAELLRFPERSTEIGKLLSSYLQKKSDVEDHSVHLLFSANRWEQVPLIKEKLSQGVTLVVDRYAFSGVAFTGAKENFSLDWCKQPDVGLPKPDLVLFLQLQLADAAKRGAFGHERYENGAFQERALRCFHQLMKDTTLNWKMVDASKSIEAVHEDIRVLSEDAIATATEKPLGELWK' -### extract per-residue embeddings +### extract per-residue embeddings (if you are using ProstT5, 
add "<AA2fold> " to the sequences) emb1 = protT5_ext.extract(seq1) emb2 = protT5_ext.extract(seq2) print(emb1.shape) -- GitLab