Improve download link handling

The previous method relied on the main "download link" in the list page.
But this link was broken a solid 1/4 of the time, and far more often for
some artists.

Instead, during DB build, grab and parse each actual song page too, and
grab from it all possible download links. Use a ThreadPoolExecutor to do
this in a reasonable amount of time (default of 10 workers, but user
configurable).

Then when downloading, iterate over all download links, or provide some
user options for filtering these by ID or description.
This commit is contained in:
Joshua Boniface 2023-04-06 02:06:55 -04:00
parent 3a0ef3dcc6
commit 6ec8923336
2 changed files with 203 additions and 106 deletions

View File

@ -13,11 +13,6 @@ standardized format.
To use the tool, first use the "database" command to build or modify your local JSON database, then use the
"download" command to download songs.
To avoid overloading or abusing the C3DB website, this tool operates exclusively in sequential mode by design; at
most one page is scraped (for "database build") or song downloaded (for "download") at once. Additionally, the tool
design ensures that the JSON database of songs is stored locally, so it only needs to be built once and then is
reused to perform actual downloads without putting further load on the website.
## Installation
1. Install the Python3 requirements from `requirements.txt`.
@ -39,8 +34,9 @@ fetch all available songs for all games, and either specify it with the `-u`/`--b
environment variable `C3DBDL_BASE_URL`.
1. Initialize your C3DB JSON database with `c3dbdl [options] database build`. This will take a fair amount
of time to complete as all pages of the chosen base URL are scanned. Note that if you cancel this process, no
data will be saved, so let it complete!
of time to complete as all pages of the chosen base URL, and all song pages (30,000+) are scanned. Note that if
you cancel this process, no data will be saved, so let it complete! The default concurrency setting should make
this relatively quick but YMMV.
1. Download any song(s) you want with `c3dbdl [options] download [options]`.
@ -86,6 +82,9 @@ Downloading song "Rush - Sweet Miracle" by ejthedj...
Downloading from https://dl.c3universe.com/s/ejthedj/sweetMiracle...
```
In addition to the above filters, within each song may be more than one download link. To filter these links,
use the "-i"/"--download-id" and "-d"/"--download-descr" options (see the help for details).
Feel free to experiment.
## Output Format

296
c3dbdl
View File

@ -10,11 +10,87 @@ from difflib import unified_diff
from colorama import Fore
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from concurrent.futures import ThreadPoolExecutor, as_completed
CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'], max_content_width=120)
def fetchSongData(entry):
    """
    Parse one song table row plus its linked song page into a database entry.

    Runs inside a ThreadPoolExecutor worker, so progress output is collected
    into a "messages" list (returned to the caller for echoing) rather than
    printed directly, which would interleave output across threads.

    Returns a (messages, song_entry) 2-tuple. song_entry is None when the row
    is unusable: required fields missing, the song page unreachable, or no
    c3universe.com download links found. Always returning a 2-tuple keeps the
    caller's "messages, song = future.result()" unpack from raising
    TypeError (a bare "return None" would be swallowed by the caller's broad
    except, silently discarding all progress messages).
    """
    song_entry = dict()
    messages = list()

    # Table cells are positional, so dispatch on the column index.
    for idx, td in enumerate(entry.find_all('td')):
        if idx == 2:
            # Artist ('/' replaced since these values become path components)
            song_entry["artist"] = td.find('a').get_text().strip().replace('/', '+')
        elif idx == 3:
            # Song title and album
            song_entry["title"] = td.find('div', attrs={'class':'c3ttitlemargin'}).get_text().strip().replace('/', '+')
            song_entry["album"] = td.find('div', attrs={'class':'c3tartist'}).get_text().strip().replace('/', '+')
            # Song page: first usable href in this cell
            tmp_links = td.find_all('a', href=True)
            for link in tmp_links:
                if link.get('href'):
                    song_entry["song_link"] = link.get('href')
                    break
        elif idx == 4:
            # Genre
            song_entry["genre"] = td.find('a').get_text().strip()
        elif idx == 5:
            # Year
            song_entry["year"] = td.find('a').get_text().strip()
        elif idx == 6:
            # Length
            song_entry["length"] = td.find('a').get_text().strip()
        elif idx == 8:
            # Author (of chart)
            song_entry["author"] = td.find('a').get_text().strip().replace('/', '+')

    # Use .get() so a short or malformed row cannot raise KeyError here;
    # rows without author/title/song_link are not usable entries.
    if not (song_entry.get("author") and song_entry.get("title") and song_entry.get("song_link")):
        return messages, None

    messages.append(f"> Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}")
    # Normalize any missing optional metadata to the literal string "None".
    for entry_type in ["artist", "album", "genre", "year", "length"]:
        if not song_entry.get(entry_type):
            song_entry[entry_type] = "None"

    # Get download links from the actual song page; retry transient
    # network errors with a simple linear backoff.
    attempts = 1
    sp = None
    while attempts <= 3:
        try:
            messages.append(f"Parsing song page {song_entry['song_link']} (attempt {attempts}/3)...")
            sp = requests.get(song_entry["song_link"])
            break
        except Exception:
            sleep(attempts)
            attempts += 1
    if sp is None or sp.status_code != 200:
        messages.append("Failed to fetch song page, aborting")
        return messages, None

    song_parsed_html = BeautifulSoup(sp.text, 'html.parser')
    download_section = song_parsed_html.find('div', attrs={"class": "portlet light bg-inverse"})
    if download_section is None:
        # Guard: find() returns None when the div is absent, and calling
        # find_all on it would raise AttributeError inside the worker.
        messages.append("Found no download section on song page, not adding to database")
        return messages, None

    download_links = download_section.find_all('a', href=True)
    dl_links = list()
    for link_entry in download_links:
        link = link_entry.get('href')
        description = link_entry.get_text().strip()
        # Only c3universe.com links are actual song downloads.
        if "c3universe.com" not in link:
            continue
        messages.append(f"Found download link: {link} ({description})")
        dl_links.append({
            "link": link,
            "description": description,
        })
    if not dl_links:
        messages.append("Found no c3universe.com download links for song, not adding to database")
        return messages, None
    song_entry["dl_links"] = dl_links

    return messages, song_entry
def buildDatabase(pages, concurrency):
found_songs = []
if pages is None:
@ -46,106 +122,109 @@ def buildDatabase(pages=None):
table_html = parsed_html.body.find('div', attrs={'class':'portlet-body'}).find('tbody')
entries = list()
for entry in table_html.find_all('tr', attrs={'class':'odd'}):
if len(entry) < 1:
break
song_entry = dict()
for idx, td in enumerate(entry.find_all('td')):
if idx == 1:
# Download link
song_entry["dl_link"] = td.find('a', attrs={'target':'_blank'}).get('href')
elif idx == 2:
# Artist
song_entry["artist"] = td.find('a').get_text().strip().replace('/', '+')
elif idx == 3:
# Song
song_entry["title"] = td.find('div', attrs={'class':'c3ttitlemargin'}).get_text().strip().replace('/', '+')
song_entry["album"] = td.find('div', attrs={'class':'c3tartist'}).get_text().strip().replace('/', '+')
elif idx == 4:
# Genre
song_entry["genre"] = td.find('a').get_text().strip()
elif idx == 5:
# Year
song_entry["year"] = td.find('a').get_text().strip()
elif idx == 6:
# Length
song_entry["length"] = td.find('a').get_text().strip()
elif idx == 8:
# Author (of chart)
song_entry["author"] = td.find('a').get_text().strip().replace('/', '+')
if song_entry and song_entry['author'] and song_entry['title']:
click.echo(f"Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}")
for entry_type in ["artist", "album", "genre", "year", "length"]:
if not song_entry[entry_type]:
song_entry[entry_type] = "None"
found_songs.append(song_entry)
entries.append(entry)
click.echo("Fetching and parsing song pages...")
with ThreadPoolExecutor(max_workers=concurrency) as executor:
future_to_song = {executor.submit(fetchSongData, entry): entry for entry in entries}
for future in as_completed(future_to_song):
try:
messages, song = future.result()
click.echo('\n'.join(messages))
if song is None:
continue
found_songs.append(song)
except Exception:
continue
return found_songs
def downloadSong(destination, filename, entry, dlid, dldesc):
    """
    Download every matching file for one song database entry.

    Arguments:
        destination: base download directory.
        filename:    format template; fields genre/artist/album/title/year/
                     author/orig_name are filled from the entry and link URL.
        entry:       song database entry (dict) containing a "dl_links" list.
        dlid:        1-based index selecting a single dl_links entry, or None
                     for all of them.
        dldesc:      substring filter against each link's description, or
                     None for no filtering.

    Prints progress via click.echo; returns None. Failures on one link are
    reported and skipped so remaining links are still attempted.
    """
    click.echo(f"""> Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}...""")

    # Select download links by 1-based ID, if given.
    if dlid is None:
        dl_links = entry['dl_links']
    else:
        try:
            dl_links = [entry['dl_links'][dlid - 1]]
        except Exception:
            click.echo(f"Invalid download link ID {dlid}.")
            return

    # Filter by description substring, if given. The loop variable must NOT
    # be named "entry": shadowing the song entry parameter here would make
    # the filename.format() lookups below (entry['genre'], ...) raise
    # KeyError against a dl_link dict.
    if dldesc is not None:
        new_dl_links = list()
        for dl_entry in dl_links:
            if dldesc in dl_entry['description']:
                new_dl_links.append(dl_entry)
        dl_links = new_dl_links

    if not dl_links:
        click.echo(f'No download link matching description "{dldesc}" found.')
        return

    for dl_link in dl_links:
        try:
            # The dl_link page is an interstitial; the real file URL is in
            # the "lock-head" div.
            p = requests.get(dl_link['link'])
            if p.status_code != 200:
                raise HTTPError(dl_link['link'], p.status_code, "", None, None)
            parsed_html = BeautifulSoup(p.text, 'html.parser')
            download_url = parsed_html.body.find('div', attrs={'class':'lock-head'}).find('a').get('href')
        except Exception as e:
            click.echo(f"Failed parsing or retrieving HTML link: {e}")
            continue

        download_filename = filename.format(
            genre=entry['genre'],
            artist=entry['artist'],
            album=entry['album'],
            title=entry['title'],
            year=entry['year'],
            author=entry['author'],
            orig_name=download_url.split('/')[-1],
        )
        download_filename = f"{destination}/{download_filename}"
        # Parent directory of the target file, created below if needed.
        download_path = '/'.join(f"{download_filename}".split('/')[0:-1])

        click.echo(f"""Downloading file "{dl_link['description']}" from {download_url}...""")
        if os.path.exists(download_filename):
            click.echo(f"File exists at {download_filename}")
            continue

        attempts = 1
        try:
            with requests.get(download_url, stream=True) as r:
                # NOTE(review): this retry loop re-checks the same response
                # object rather than re-issuing the request, so a failed
                # status can never change between attempts — confirm whether
                # re-fetching was intended.
                while attempts <= 3:
                    try:
                        r.raise_for_status()
                        break
                    except Exception:
                        click.echo(f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/3")
                        sleep(attempts)
                        attempts += 1
                if r.status_code != 200:
                    raise HTTPError(download_url, r.status_code, "", None, None)
                if not os.path.exists(download_path):
                    os.makedirs(download_path)
                # Stream to disk in chunks to avoid buffering the whole file.
                with open(download_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            click.echo(f"Successfully downloaded to {download_filename}")
        except Exception as e:
            click.echo(f"Download attempt failed: {e}")
            continue
@ -158,7 +237,11 @@ def downloadSong(destination, filename, entry):
"-p", "--pages", "_pages", type=int, default=None, envvar='C3DBDL_BUILD_PAGES',
help="Number of pages to scan (default is all)."
)
def build_database(_overwrite, _pages):
@click.option(
"-c", "--concurrency", "_concurrency", type=int, default=10, envvar='C3DBDL_BUILD_CONCURRENCY',
help="Number of concurrent song page downloads to perform at once."
)
def build_database(_overwrite, _pages, _concurrency):
"""
Initialize the local JSON database of C3DB songs from the website.
@ -173,7 +256,7 @@ def build_database(_overwrite, _pages):
exit(1)
click.echo("Building JSON database; this will take a long time...")
songs_database = buildDatabase(_pages)
songs_database = buildDatabase(_pages, _concurrency)
click.echo('')
click.echo(f"Found {len(songs_database)} songs, dumping to database file '{config['database_filename']}'")
if not os.path.exists(config['download_directory']):
@ -267,7 +350,17 @@ def database():
default=None, type=int,
help='Limit to this many songs (first N matches).'
)
def download(_filters, _limit, _file_structure):
@click.option(
"-i", "--download-id", "_id",
default=None, type=int,
help='Download only "dl_links" entry N (1 is first, etc.), or all if unspecified.'
)
@click.option(
"-d", "--download-descr", "_desc",
default=None,
help='Download only "dl_links" entries with this in their description (fuzzy).'
)
def download(_filters, _id, _desc, _limit, _file_structure):
"""
Download song(s) from the C3DB webpage.
@ -286,14 +379,12 @@ def download(_filters, _limit, _file_structure):
The default output file structure is:
"{genre}/{author}/{artist}/{album}/{title} [{year}].{orig_name}"
\b
Filters allow granular selection of the song(s) to download. Multiple filters can be
specified, and a song is selected only if ALL filters match (logical AND). Each filter
is in the form:
--filter [database_key] [value]
is in the form "--filter [database_key] [value]".
\b
The valid "database_key" values are identical to the output file fields above.
The valid "database_key" values are identical to the output file fields above, except
for "orig_name".
\b
For example, to download all songs in the genre "Rock":
@ -303,6 +394,13 @@ def download(_filters, _limit, _file_structure):
Or to download all songs by the artist "Rush" and the author "MyName":
--filter artist Rush --filter author MyName
In addition to filters, each song may have more than one download link, to provide
multiple versions of the same song (for example, normal and multitracks, or alternate
charts). For each song, the "-i"/"--download-id" and "-d"/"--download-descr" options
can help filter these out, or both can be left blank to download all possible files
for a given song. Mostly useful when being extremely restrictive with filters, less
so when downloading many songs at once.
\b
The following environment variables can be used for scripting purposes:
* C3DBDL_DL_FILE_STRUCTURE: equivalent to "--file-structure"
@ -331,7 +429,7 @@ def download(_filters, _limit, _file_structure):
click.echo(f"Downloading {len(pending_songs)} song files...")
for song in pending_songs:
downloadSong(config['download_directory'], _file_structure, song)
downloadSong(config['download_directory'], _file_structure, song, _id, _desc)
@click.group(context_settings=CONTEXT_SETTINGS)