-
-
Notifications
You must be signed in to change notification settings - Fork 49.2k
added smith waterman algorithm #9001
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
4672878
44314e4
65b95a6
8e56b7e
d8a6bcb
fc58801
0662f69
892858a
37d7fed
0e199f7
2729a57
2a3e20a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
|
|
||
# Reference: https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
"""
Scoring scheme for Smith-Waterman local alignment: a pair of equal characters
earns a positive score, a pair of differing characters a negative one, and every
gap inserted into either sequence is penalized.
"""
MATCH: int = 1  # added when the two aligned characters are equal
MISMATCH: int = -1  # added when the two aligned characters differ
GAP: int = -2  # added for each character aligned against a gap
|
|
||
|
|
||
def score_function(a: str, b: str) -> int:
    """
    Score one pair of aligned characters.

    Returns MATCH when ``a`` and ``b`` compare equal, MISMATCH otherwise.

    >>> score_function('A', 'A')
    1
    >>> score_function('A', 'C')
    -1
    """
    return MATCH if a == b else MISMATCH
|
|
||
|
|
||
def smith_waterman(query: str, subject: str) -> list[list[int]]:
    """
    Build the Smith-Waterman score matrix for two sequences.

    The result has ``len(query) + 1`` rows and ``len(subject) + 1`` columns.
    Entry ``[i][j]`` is the score of the best local alignment that ends at
    ``query[i - 1]`` and ``subject[j - 1]``; row 0 and column 0 are all zeros.

    >>> smith_waterman('ACAC', 'CA')
    [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]]
    """
    # The border row of zeros lets an alignment start anywhere at no cost
    matrix = [[0] * (len(subject) + 1)]

    for query_char in query:
        previous_row = matrix[-1]
        current_row = [0]  # border column
        for col, subject_char in enumerate(subject, start=1):
            diagonal = previous_row[col - 1] + score_function(query_char, subject_char)
            from_above = previous_row[col] + GAP
            from_left = current_row[col - 1] + GAP
            # Clamping at 0 is what makes the alignment local rather than global
            current_row.append(max(0, diagonal, from_above, from_left))
        matrix.append(current_row)

    return matrix
|
|
||
|
|
||
def traceback(score: list[list[int]], query: str, subject: str) -> str:
    r"""
    Recover the best local alignment from a Smith-Waterman score matrix.

    The walk starts at the highest-scoring cell (the first one in row-major
    order when several tie) and follows the move that produced each cell --
    diagonal, up or left -- until it reaches a cell scoring 0 or the border of
    the matrix.

    :param score: matrix as produced by ``smith_waterman(query, subject)``, with
        ``len(query) + 1`` rows and ``len(subject) + 1`` columns.
    :param query: sequence indexed by the matrix rows.
    :param subject: sequence indexed by the matrix columns.
    :return: the aligned query and the aligned subject joined by a newline, with
        ``-`` marking gaps; ``"\n"`` when no cell has a positive score.

    >>> traceback([[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 2], [0, 1, 0]], 'ACAC', 'CA')
    'CA\nCA'
    >>> traceback([[0, 0], [0, 0]], 'A', 'C')
    '\n'
    """
    # A local alignment ends at the best cell, not at the bottom-right corner
    best = 0
    i = j = 0
    for row_index, row in enumerate(score):
        for col_index, value in enumerate(row):
            if value > best:
                best = value
                i, j = row_index, col_index

    # Characters are collected back-to-front and reversed once at the end,
    # which avoids repeatedly prepending to a string
    aligned_query: list[str] = []
    aligned_subject: list[str] = []

    # A score of 0 marks the start of the local alignment
    while i > 0 and j > 0 and score[i][j] > 0:
        if score[i][j] == score[i - 1][j - 1] + score_function(
            query[i - 1], subject[j - 1]
        ):
            # Diagonal move: both characters are part of the alignment
            aligned_query.append(query[i - 1])
            aligned_subject.append(subject[j - 1])
            i -= 1
            j -= 1
        elif score[i][j] == score[i - 1][j] + GAP:
            # Vertical move: query character aligned against a gap
            aligned_query.append(query[i - 1])
            aligned_subject.append("-")
            i -= 1
        else:
            # Horizontal move: subject character aligned against a gap
            aligned_query.append("-")
            aligned_subject.append(subject[j - 1])
            j -= 1

    align1 = "".join(reversed(aligned_query))
    align2 = "".join(reversed(aligned_subject))
    return f"{align1}\n{align2}"
|
|
||
|
|
||
if __name__ == "__main__":
    # Demo run: align two short example sequences and print the alignment
    query, subject = "HEAGAWGHEE", "PAWHEAE"
    print(traceback(smith_waterman(query, subject), query, subject))
Uh oh!
There was an error while loading. Please reload this page.