我正在尝试使用python从sql文件中提取源表和目标表。到目前为止,我能够使用简单的python代码来提取那些,这些代码寻找关键字“from,join,insert,create”来提取表名,但是现在我这样做是为了那些可以包含函数并且可以在后面使用的过程,这些过程可以出现在我不想要的源代码列表中。我想把它移走,但失败了。我正在粘贴从中提取源代码和目标代码
def tables_in_query(sql_str):
# remove the /* */ comments
q = re.sub(r'/\*[^*]*\*+(?:[^*/][^*]*\*+)*/', "", sql_str)
# remove whole line -- and # comments
lines = [line for line in q.splitlines() if not re.match("^\\s*(--|#)", line)]
# remove trailing -- and # comments
q = " ".join([re.split("--|#", line)[0] for line in lines])
# split on blanks, parenthesis and semicolons
tokens = re.split(r"[\s)(;]+", q)
source, target, target_views, with_1 = set(), set(), set(), set()
view_idx = [i+1 for i, x in enumerate(tokens) if x.lower() in ["view"]]
for i in view_idx:
target_views.add(tokens[i])
with_index = [i + 1 for i, x in enumerate(tokens) if x.lower() in ["with"]]
for i in with_index:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() in ["table", "replace"]):
with_1.add(t)
target_index = [i + 3 for i, x in enumerate(tokens) if x.lower() in ["create"]]
for i in target_index:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() in ["table", "replace"]) and not with_1.__contains__(
t.lower()):
target.add(t)
target_index = [i + 2 for i, x in enumerate(tokens) if x.lower() in ["insert"] and tokens[i+1].lower() in ["into"]]
for i in target_index:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() in ["table", "replace"]) and not with_1.__contains__(
t.lower()):
target.add(t)
target_idx = [i + 1 for i, x in enumerate(tokens) if x.lower() in ["update"]]
for i in target_idx:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() in ["table", "replace"]) and not with_1.__contains__(
t.lower()):
target.add(t.replace('..', '.admin.'))
source_index = [i + 1 for i, x in enumerate(tokens) if x.lower() in ["from", "join"]]
for i in source_index:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() == "select" or t.lower() == "" or t.lower().startswith("substr") or t.lower().startswith("last_day")) and not with_1.__contains__(
t.lower()):
source.add(t)
# print(t)
source_idx = [i + 2 for i, x in enumerate(tokens) if x.lower() in ["alter"]]
for i in source_idx:
t = tokens[i]
if not (t.lower().startswith('tmp') or t.lower() == "select" or t.lower() == "") and not with_1.__contains__(
t.lower()):
source.add(t)
return source, target, target_views
1条答案
按热度按时间olhwl3o21#
对我来说好像有很多工作要做。我建议您使用久经考验的解决方案,如sql元数据(https://github.com/macbre/sql-metadata)amd sql分析(https://github.com/andialbrecht/sqlparse)
或者,如果您使用的是sqlite—可能还有其他dbase引擎—您也可以尝试使用explain来获取感兴趣的sql语句的字节码表,然后查找表操作码,例如openread(源表)、openwrite(目标表)和droptable(目标表?)。表引用位于p2列中,您可以在该列中查找sqlite\u主表以获取表名。这样做的好处是,如果您的sql语句引用了一个视图,字节码将引用底层表,而这是任何解析都无法揭示的。