对于Clinicaltrials.gov数据,我尝试去掉括号,解析合作者名称(可以有可变数量的合作者),并将Phase
解析为数字。我得到了这个错误:
import pandas as pd
import json
import urllib.request, urllib.parse, urllib.error
import sqlite3
import ssl
import ast
conn = sqlite3.connect('ctrialsdb.sqlite')
cur = conn.cursor()
# Setup DB
cur.executescript('''
DROP TABLE IF EXISTS ctdata;
CREATE TABLE ctdata (
NCTId TEXT NOT NULL PRIMARY KEY,
LeadSponsorName TEXT,
CollaboratorName TEXT,
InterventionName TEXT,
Phase TEXT,
OverallStatus TEXT,
Condition TEXT,
LastUpdatePostDate TEXT
)
''')
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
serviceurl ='https://www.clinicaltrials.gov/api/query/study_fields?'
sponsor = input('Company name: ')
url=serviceurl + urllib.parse.urlencode({'expr' : sponsor, 'fields' :
'NCTId, LeadSponsorName,CollaboratorName,InterventionName,Phase,OverallStatus,Condition, LastUpdatePostDate',
'min_rnk' :'1','max_rnk':'1000','fmt' :'json'})
#print(url)
urlo=urllib.request.urlopen(url, context = ctx)
urld = urlo.read().decode()
jdata = json.loads(urld)
#print(json.dumps(jdata, indent=4))
df = pd.DataFrame(jdata['StudyFieldsResponse']['StudyFields'])
df
for child in jdata['StudyFieldsResponse']['StudyFields']:
cur.execute('''INSERT OR IGNORE INTO ctdata
(NCTId, LeadSponsorName, CollaboratorName, InterventionName, Phase, OverallStatus, Condition, LastUpdatePostDate) VALUES (?,?,?,?,?,?,?,?)''',
(child['NCTId'],child['LeadSponsorName'],child['CollaboratorName'],child['InterventionName'],child['Phase'], child['OverallStatus'], child['Condition'], child['LastUpdatePostDate'], ) )
conn.commit()
产出:
Rank NCTId LeadSponsorName CollaboratorName InterventionName Phase OverallStatus Condition LastUpdatePostDate
0 1 [NCT01745367] [AVEO Pharmaceuticals, Inc.] [Astellas Pharma Inc] [Tivozanib Hydrochloride, paclitaxel, Placebo] [Phase 2] [Terminated] [Triple Negative Breast Cancer] [October 27, 2020]
1 2 [NCT01673386] [AVEO Pharmaceuticals, Inc.] [Astellas Pharma Inc] [Tivozanib, Sunitinib] [Phase 2] [Terminated] [Metastatic Renal Cell Carcinoma] [October 27, 2020]
2 3 [NCT02318368] [AVEO Pharmaceuticals, Inc.] [Biodesix, Inc.] [Ficlatuzumab, Erlotinib, placebo] [Phase 2] [Terminated] [Non-small Cell Lung Cancer] [October 22, 2020]
3 4 [NCT01369433] [AVEO Pharmaceuticals, Inc.] [] [Tivozanib + paclitaxel, Tivozanib + temsiroli... [Not Applicable] [Terminated] [Solid Tumors] [September 1, 2020]
误差率
---------------------------------------------------------------------------
InterfaceError Traceback (most recent call last)
Input In [1], in <cell line: 72>()
54 #dfs = (jdata['StudyFieldsResponse']['StudyFields'])
55 #dfs
56
(...)
69 # Condition = child[7]
70 # LastUpdatePostDate = child[8]
72 for child in jdata['StudyFieldsResponse']['StudyFields']:
---> 73 cur.execute('''INSERT OR IGNORE INTO ctdata
74 (NCTId, LeadSponsorName, CollaboratorName, InterventionName, Phase, OverallStatus, Condition, LastUpdatePostDate) VALUES (?,?,?,?,?,?,?,?)''', (child['NCTId'],child['LeadSponsorName'],child['CollaboratorName'],child['InterventionName'],child['Phase'], child['OverallStatus'], child['Condition'], child['LastUpdatePostDate'], ) )
75 conn.commit()
InterfaceError: Error binding parameter 0 - probably unsupported type.
JSON结构:
{
"StudyFieldsResponse": {
"APIVrs": "1.01.05",
"DataVrs": "2022:09:26 23:26:47.947",
"Expression": "AVEO Pharmaceuticals, Inc.",
"NStudiesAvail": 428928,
"NStudiesFound": 42,
"MinRank": 1,
"MaxRank": 1000,
"NStudiesReturned": 42,
"FieldList": [
"NCTId",
"LeadSponsorName",
"CollaboratorName",
"InterventionName",
"Phase",
"OverallStatus",
"Condition",
"LastUpdatePostDate"
],
"StudyFields": [
{
"Rank": 1,
"NCTId": [
"NCT01745367"
],
"LeadSponsorName": [
"AVEO Pharmaceuticals, Inc."
],
"CollaboratorName": [
"Astellas Pharma Inc"
],
"InterventionName": [
"Tivozanib Hydrochloride",
"paclitaxel",
"Placebo"
],
"Phase": [
"Phase 2"
],
"OverallStatus": [
"Terminated"
],
"Condition": [
"Triple Negative Breast Cancer"
],
"LastUpdatePostDate": [
"October 27, 2020"
]
},
{
"Rank": 2,
"NCTId": [
"NCT01673386"
],
"LeadSponsorName": [
"AVEO Pharmaceuticals, Inc."
],
"CollaboratorName": [
"Astellas Pharma Inc"
],
"InterventionName": [
"Tivozanib",
"Sunitinib"
],
"Phase": [
"Phase 2"
],
"OverallStatus": [
"Terminated"
],
"Condition": [
"Metastatic Renal Cell Carcinoma"
],
"LastUpdatePostDate": [
"October 27, 2020"
]
},
{
"Rank": 3,
"NCTId": [
"NCT02318368"
],
"LeadSponsorName": [
"AVEO Pharmaceuticals, Inc."
],
"CollaboratorName": [
"Biodesix, Inc."
],
"InterventionName": [
"Ficlatuzumab",
"Erlotinib",
"placebo"
],
"Phase": [
"Phase 2"
],
"OverallStatus": [
"Terminated"
],
"Condition": [
"Non-small Cell Lung Cancer"
],
"LastUpdatePostDate": [
"October 22, 2020"
]
1条答案
按热度按时间vdzxcuhz1#
我过多地考虑了这个过程--简单的解决办法:
任何其他人有更好的想法请张贴,总是寻求改进!多亏了PChemGuy--当我开始关注类型(他的问题)时,我意识到我需要将列表扁平化。干杯,MJRBBQ