publication_to_data/webscrape_datasets_from_publication.py (41 additions & 30 deletions)
## Scrapes dataset and code links from a publication's DOI landing page.
## Formats handled so far: Nature, Science, and PeerJ.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def get_datasets_from_publication(doi):
    """Return a list of dataset records scraped from a paper's DOI landing page.

    Each record is a dict of the form
    {'identifier': <url>, 'access_open': True, 'license_cc_by': True}.
    """
    URL = 'https://dx.doi.org/' + doi

    html = urlopen(URL)
    soup = BeautifulSoup(html, "html.parser")

    dataset = []
    data_id = []

    # Nature: links live in the "Data availability" and "Code availability" sections.
    for section_id in ("data-availability-content", "code-availability-content"):
        for div in soup.find_all('div', attrs={"id": section_id}):
            paragraph = div.find('p')
            if paragraph is not None:
                data_id += re.findall(r'(https?://\S+)', paragraph.text)

    # Science: links live in the acknowledgements block; scan its raw HTML.
    for div in soup.find_all('div', attrs={"class": "ack"}):
        data_id += re.findall(r'(https?://\S+)', str(div))

    # PeerJ: links live in the "Additional Information" section and in the
    # DOI components attached to supplemental material.
    for div in soup.find_all('div', attrs={"id": "addinfo-1"}):
        anchor = div.find('a')
        if anchor is not None and anchor.has_attr('href'):
            data_id += re.findall(r'(https?://\S+)', anchor['href'])

    for div in soup.find_all('div', attrs={"class": "object-id article-component-doi"}):
        anchor = div.find('a')
        if anchor is not None and anchor.has_attr('href'):
            data_id += re.findall(r'(https?://\S+)', anchor['href'])

    # Wrap every harvested link in a record; the access and license flags
    # are hardcoded to True for now.
    for item in data_id:
        dataset.append({'identifier': item, 'access_open': True, 'license_cc_by': True})

    return dataset
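
# The r'(https?://\S+)' pattern grabs every non-space character after the
# scheme, so links embedded in prose often come back with trailing punctuation
# attached. A minimal cleanup sketch; the helper name and the trimmed
# character set are assumptions, not part of the original script:
def strip_trailing_punctuation(url):
    """Trim punctuation that the URL regex drags in from surrounding prose."""
    return url.rstrip('.,;:)]')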


# dataset = get_datasets_from_publication('10.1038/s41467-020-18551-0')
# print(dataset)
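
# A runnable usage sketch (needs network access; the DOI is the Nature paper
# from the comment above, and the printed key is exactly the one built in
# get_datasets_from_publication):
if __name__ == '__main__':
    for record in get_datasets_from_publication('10.1038/s41467-020-18551-0'):
        print(record['identifier'])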