发布于2019-08-07 14:41 阅读(2649) 评论(0) 点赞(0) 收藏(2)
abbr. Longest Common Subsequence,常用于检测文本相似度,或在图数据中求取公共路径,目的是求解序列间的非连续公共部分。以下代码分别呈现二维和多维动态规划解法,原理可参考:Reference。
以下代码整理自 [美] Goodrich et al. 所著《Data Structures and Algorithms in Python》。
def LCS(X, Y):
"""Return table such that L[j][k] is length of LCS for X[0:j] and Y[0:k]."""
n, m = len(X), len(Y) # introduce convenient notations
L = [[0] * (m+1) for k in range(n+1)] # (n+1) x (m+1) table
for j in range(n):
for k in range(m):
if X[j] == Y[k]: # align this match
L[j+1][k+1] = L[j][k] + 1
else: # choose to ignore one character
L[j+1][k+1] = max(L[j][k+1], L[j+1][k])
return L
def LCS_solution(X, Y, L):
"""Return the longest common substring of X and Y, given LCS table L."""
solution = []
j,k = len(X), len(Y)
while L[j][k] > 0: # common characters remain
if X[j-1] == Y[k-1]:
solution.append(X[j-1])
j -= 1
k -= 1
elif L[j-1][k] >= L[j][k-1]:
j -=1
else:
k -= 1
return ''.join(reversed(solution)) # return left-to-right version
if __name__ == '__main__':
X, Y = 'This is test string A.', 'This is another test string B.'
L = LCS(X,Y)
solution = LCS_solution(X,Y,L)
print(solution)
笔者在二维解法的基础上,将代码进行了泛化处理,泛化后的模型能求解任意条序列的公共子序列。但需要注意的是,由于运用的是动态规划解法,算法的时间复杂度无可避免为
import numpy as np
from copy import deepcopy
class LCS:
def __init__(self):
self.seqs = None # input sequences
self.L = None # LCS Table
self.solution = None
def solve(self,seqs):
'''Return the longest common substring of given sequences.'''
self.seqs = seqs
self.len_list = [len(seq) for seq in self.seqs] # introduce convenient notations
self.L = np.zeros(shape=[seq_len + 1 for seq_len in self.len_list]) # (n+1) x (m+1) x ... table
idx_list = [0]*(len(self.seqs)-1)+[-1]
stop_list = [seq_len-1 for seq_len in self.len_list]
while True:
idx, next_step = self.forward(idx_list,stop_list)
if next_step == 'stop': break
if self.all_equal(next_step): # align this match
self.L[tuple(self.margin(next_step))] = self.L[tuple(next_step)] + 1
else: # choose to ignore one character
self.L[tuple(self.margin(next_step))] = max([self.L[tuple(self.margin(next_step,[dim],reverse))] for reverse in (True,False) for dim in range(len(next_step))])
idx_list = next_step
self.solution = ''
idx_list = deepcopy(self.len_list)
while self.L[tuple(idx_list)] > 0: # common characters remain
next_step = self.margin(idx_list,forward=False)
if self.all_equal(next_step):
self.solution += self.seqs[0][next_step[0]]
idx_list = next_step
else:
candidates = [tuple(self.margin(idx_list,[dim],reverse,forward=False)) for reverse in (True,False) for dim in range(len(idx_list))]
candidates = [(candidate,self.L[candidate]) for candidate in candidates]
candidates = sorted(candidates,key=lambda x:x[1],reverse=True)
idx_list = list(candidates[0][0])
self.solution = self.solution[::-1] # return left-to-right version
return self.solution
def margin(self,idx_list,dims=None,reverse=False,forward=True):
'''Return the next movement of the index list.'''
if dims is None: dims = list(range(len(idx_list)))
idx_list_copy = deepcopy(idx_list)
if reverse == False:
for dim in dims:
idx_list_copy[dim] += 1 if forward else -1
else:
idx_list_copy = [idx + 1 for idx in idx_list_copy] if forward else [idx - 1 for idx in idx_list_copy]
for dim in dims:
idx_list_copy[dim] -= 1 if forward else -1
return idx_list_copy
def forward(self,idx_list,stop_list):
'''Return the next index list following the loop regulations.'''
idx_list_copy = deepcopy(idx_list)
for i in range(len(idx_list_copy)-1,-1,-1):
if idx_list_copy[i] != stop_list[i]:
idx_list_copy[i] += 1
idx_list_copy[i+1:] = [0] * (len(idx_list_copy)-1-i)
return i,idx_list_copy
return (0,'stop')
def all_equal(self,idx_list):
'''Check whether the corresponding characters equal.'''
for i in range(len(self.seqs)-1):
if self.seqs[i][idx_list[i]] != self.seqs[i+1][idx_list[i+1]]:
return False
return True
if __name__ == '__main__':
model = LCS()
solution = model.solve(['This is test string A.', 'This is another test string B.','We totally have three test strings.'])
print(solution)
作者:iuie9493
链接:https://www.pythonheidong.com/blog/article/11373/a0635f630d52c1bd60e6/
来源:python黑洞网
任何形式的转载都请注明出处,如有侵权 一经发现 必将追究其法律责任
昵称:
评论内容:(最多支持255个字符)
---无人问津也好,技不如人也罢,你都要试着安静下来,去做自己该做的事,而不是让内心的烦躁、焦虑,坏掉你本来就不多的热情和定力
Copyright © 2018-2021 python黑洞网 All Rights Reserved 版权所有,并保留所有权利。 京ICP备18063182号-1
投诉与举报,广告合作请联系vgs_info@163.com或QQ3083709327
免责声明:网站文章均由用户上传,仅供读者学习交流使用,禁止用做商业用途。若文章涉及色情,反动,侵权等违法信息,请向我们举报,一经核实我们会立即删除!