-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGHParser.py
More file actions
78 lines (66 loc) · 2.79 KB
/
GHParser.py
File metadata and controls
78 lines (66 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#-- ################################################################
#-- ______ ________
#-- / \ / | Title: Github Parser
#-- /$$$$$$ |$$$$$$$$/ Author: Ahmad M.
#-- $$ |__$$ | /$$/ Date: 231018
#-- $$ $$ | /$$/ Params: github links
#-- $$$$$$$$ | /$$/ Descrip: sometimes repos are built in a way
#-- $$ | $$ | /$$/ that makes it hard to clone them.
#-- $$ | $$ |/$$/ this might be the solution
#-- $$/ $$/ $$/ Notes: only works for 1-level depth right now
#-- ################################################################
print("Welcome to A7's Github parser !")
#-- ################################################################
print("Importing libraries and setting up procedures...")
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from requests import get
import re, os, wget
#-- ################################################################
# Ask for the GitHub folder URL to mirror. `input` is the Python 3 builtin;
# the previous `rawinput` name does not exist and raised NameError.
link = input("Please enter a link...").strip()
# link = "https://github.com/blackvitriol/ROSCOG/tree/master/ROSCOG_Nao"
# The local folder is named after the last path component of the link.
# A trailing slash is ignored so that ".../ROSCOG_Nao/" still yields a name.
foldername = link.rstrip('/').split('/')[-1]
if not foldername:
    raise SystemExit("No usable link was entered.")
# exist_ok lets the script be re-run against the same link without crashing.
os.makedirs(foldername, exist_ok=True)
# Everything downloaded or created afterwards lands inside this folder.
os.chdir(foldername)
#-- ################################################################
# 1. Get a soup of all files and folders available on the root page of given link
def get_a_soup(link):
    """Fetch a page and return its file/folder entry markup as one string.

    Downloads `link`, parses the HTML and collects every ``<span>`` whose
    class is ``css-truncate css-truncate-target``. The matches are converted
    to a single string (the ``str()`` of the result list), printed, and
    returned so the caller can search it for ``href`` attributes.

    Args:
        link: URL of the page to scrape.

    Returns:
        The string form of the list of matching ``<span>`` elements.

    Raises:
        requests.exceptions.RequestException: if the request times out,
            fails, or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = get(link, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    html_soup = BeautifulSoup(response.text, 'html.parser')
    entries = html_soup.find_all('span', class_='css-truncate css-truncate-target')
    soup = str(entries)
    print(soup)
    return soup
soup = get_a_soup(link)
#-- ################################################################
# 2. Find all files and folders in the soup, create a hierarchy tree.
#def find_links_in_soup(soup):
# Keep the root URL under its own name: it is needed to build folder URLs,
# so the loop below must not reuse `link` as its loop variable.
root_url = link.rstrip('/')
# Capture the quoted value of every href attribute. Matching up to the
# closing quote (rather than up to the next "id") keeps paths that happen to
# contain the letters "id" (e.g. "video.py") intact.
list_of_links = re.findall(r'href="([^"]+)"', soup)
for href in list_of_links:
    href = href.strip()
    print("Found link:", href)
    # Only the entry's own name decides file vs. folder, so a dot elsewhere
    # in the path (e.g. in the repository name) does not matter.
    # Limitation: a folder whose name contains a dot is treated as a file,
    # and a file without a dot is treated as a folder.
    name = href.split('/')[-1]
    if '.' in name:
        # For files in root folder: rewrite only the "/blob/" path segment to
        # "/raw/" so file names containing "blob" are left alone.
        raw = "https://github.com" + href.replace('/blob/', '/raw/', 1)
        wget.download(raw)
        print("It is a file at", raw, "and has been downloaded.")
    else:
        # For folders: create the local directory (1-level depth only; the
        # folder's contents are not fetched).
        raw = root_url + '/' + name
        os.makedirs(name, exist_ok=True)
        print("It is a folder at", raw, "and has been created.")
#print("Now browsing folder and creating tree:")
# return list_of_links
#-- ################################################################
# 3. Download the tree of files and folders.
# downloader(list_of_links)
#-- ################################################################
# Intended order of development & functionality.
# Limitations: folder names should not contain dots
# link = ""
# soup=get_a_soup(link)
# list_of_links=find_links_in_soup(soup)
# downloader(list_of_links)