diff --git a/README.md b/README.md
index d35bb57..466f201 100644
--- a/README.md
+++ b/README.md
@@ -13,11 +13,6 @@
 standardized format. To use the tool, first use the "database" command to build or modify your local JSON
 database, then use the "download" command to download songs.
 
-To avoid overloading or abusing the C3DB website, this tool operates exclusively in sequential mode by design; at
-most one page is scraped (for "database build") or song downloaded (for "download") at once. Additionally, the tool
-design ensures that the JSON database of songs is stored locally, so it only needs to be built once and then is
-reused to perform actual downloads without putting further load on the website.
-
 ## Installation
 
 1. Install the Python3 requirements from `requirements.txt`.
@@ -39,8 +34,9 @@
 fetch all available songs for all games, and either specify it with the `-u`/`--base-url` option or the
 environment variable `C3DBDL_BASE_URL`.
 
 1. Initialize your C3DB JSON database with `c3dbdl [options] database build`. This will take a fair amount
-of time to complete as all pages of the chosen base URL are scanned. Note that if you cancel this process, no
-data will be saved, so let it complete!
+of time to complete, as all pages of the chosen base URL, and all song pages (30,000+), are scanned. Note that
+if you cancel this process, no data will be saved, so let it complete! The default concurrency setting should
+make this relatively quick, but YMMV.
 
 1. Download any song(s) you want with `c3dbdl [options] download [options]`.
@@ -86,6 +82,9 @@
 Downloading song "Rush - Sweet Miracle" by ejthedj...
 Downloading from https://dl.c3universe.com/s/ejthedj/sweetMiracle...
 ```
 
+In addition to the above filters, each song may have more than one download link. To select among these
+links, use the `-i`/`--download-id` and `-d`/`--download-descr` options (see the help for details and the
+example below).
+
 Feel free to experiment.
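+
+For instance, assuming your database is already built and a song offers both a regular link and one
+described as "Multitracks" (hypothetical link descriptions), either of the following invocations would
+narrow the download to a single file:
+
+```
+c3dbdl download --filter artist Rush --filter title "Sweet Miracle" --download-id 1
+c3dbdl download --filter artist Rush --filter title "Sweet Miracle" --download-descr Multitracks
+```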
 ## Output Format
diff --git a/c3dbdl b/c3dbdl
index 2a79aa3..515e89a 100755
--- a/c3dbdl
+++ b/c3dbdl
@@ -10,11 +10,87 @@
 from difflib import unified_diff
 from colorama import Fore
 from bs4 import BeautifulSoup
 from urllib.error import HTTPError
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'], max_content_width=120)
 
+def fetchSongData(entry):
+    """Parse one song table row, fetch its song page, and collect its download links.
+
+    Returns a tuple of (messages, song_entry); song_entry is None if the row could
+    not be parsed or yielded no usable download links.
+    """
+    song_entry = dict()
+    messages = list()
 
-def buildDatabase(pages=None):
+    for idx, td in enumerate(entry.find_all('td')):
+        if idx == 2:
+            # Artist
+            song_entry["artist"] = td.find('a').get_text().strip().replace('/', '+')
+        elif idx == 3:
+            # Song
+            song_entry["title"] = td.find('div', attrs={'class':'c3ttitlemargin'}).get_text().strip().replace('/', '+')
+            song_entry["album"] = td.find('div', attrs={'class':'c3tartist'}).get_text().strip().replace('/', '+')
+            # Song page
+            tmp_links = td.find_all('a', href=True)
+            for link in tmp_links:
+                if link.get('href'):
+                    song_entry["song_link"] = link.get('href')
+                    break
+        elif idx == 4:
+            # Genre
+            song_entry["genre"] = td.find('a').get_text().strip()
+        elif idx == 5:
+            # Year
+            song_entry["year"] = td.find('a').get_text().strip()
+        elif idx == 6:
+            # Length
+            song_entry["length"] = td.find('a').get_text().strip()
+        elif idx == 8:
+            # Author (of chart)
+            song_entry["author"] = td.find('a').get_text().strip().replace('/', '+')
+
+    if song_entry and song_entry.get('author') and song_entry.get('title') and song_entry.get("song_link"):
+        messages.append(f"> Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}")
+        for entry_type in ["artist", "album", "genre", "year", "length"]:
+            if not song_entry.get(entry_type):
+                song_entry[entry_type] = "None"
+    else:
+        # Row is missing required fields; skip it
+        return messages, None
+
+    # Get download links from the actual song page
+    attempts = 1
+    sp = None
+    while attempts <= 3:
+        try:
+            messages.append(f"Parsing song page {song_entry['song_link']} (attempt {attempts}/3)...")
+            sp = requests.get(song_entry["song_link"])
+            break
+        except Exception:
+            sleep(attempts)
+            attempts += 1
+    if sp is None or sp.status_code != 200:
+        messages.append("Failed to fetch song page, aborting")
+        return messages, None
+
+    song_parsed_html = BeautifulSoup(sp.text, 'html.parser')
+
+    download_section = song_parsed_html.find('div', attrs={"class": "portlet light bg-inverse"})
+    download_links = download_section.find_all('a', href=True)
+    dl_links = list()
+    for link_entry in download_links:
+        link = link_entry.get('href')
+        description = link_entry.get_text().strip()
+        if "c3universe.com" not in link:
+            continue
+        messages.append(f"Found download link: {link} ({description})")
+        dl_links.append({
+            "link": link,
+            "description": description,
+        })
+    if not dl_links:
+        messages.append("Found no c3universe.com download links for song, not adding to database")
+        return messages, None
+    song_entry["dl_links"] = dl_links
+
+    # Hand the entry back to the caller, which appends it to the database
+    return messages, song_entry
+
+
+def buildDatabase(pages, concurrency):
     found_songs = []
 
     if pages is None:
@@ -46,106 +122,109 @@
         table_html = parsed_html.body.find('div', attrs={'class':'portlet-body'}).find('tbody')
 
+        entries = list()
         for entry in table_html.find_all('tr', attrs={'class':'odd'}):
             if len(entry) < 1:
                 break
-
-            song_entry = dict()
-
-            for idx, td in enumerate(entry.find_all('td')):
-                if idx == 1:
-                    # Download link
-                    song_entry["dl_link"] = td.find('a', attrs={'target':'_blank'}).get('href')
-                elif idx == 2:
-                    # Artist
-                    song_entry["artist"] = td.find('a').get_text().strip().replace('/', '+')
-
-                elif idx == 3:
-                    # Song
-                    song_entry["title"] = td.find('div', attrs={'class':'c3ttitlemargin'}).get_text().strip().replace('/', '+')
-                    song_entry["album"] = td.find('div', attrs={'class':'c3tartist'}).get_text().strip().replace('/', '+')
-                elif idx == 4:
-                    # Genre
-                    song_entry["genre"] = td.find('a').get_text().strip()
-                elif idx == 5:
-                    # Year
-                    song_entry["year"] = td.find('a').get_text().strip()
-                elif idx == 6:
-                    # Length
-                    song_entry["length"] = td.find('a').get_text().strip()
-                elif idx == 8:
-                    # Author (of chart)
-                    song_entry["author"] = td.find('a').get_text().strip().replace('/', '+')
-
-            if song_entry and song_entry['author'] and song_entry['title']:
-                click.echo(f"Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}")
-                for entry_type in ["artist", "album", "genre", "year", "length"]:
-                    if not song_entry[entry_type]:
-                        song_entry[entry_type] = "None"
-                found_songs.append(song_entry)
+            entries.append(entry)
+
+        click.echo("Fetching and parsing song pages...")
+        with ThreadPoolExecutor(max_workers=concurrency) as executor:
+            future_to_song = {executor.submit(fetchSongData, entry): entry for entry in entries}
+            for future in as_completed(future_to_song):
+                try:
+                    messages, song = future.result()
+                    click.echo('\n'.join(messages))
+                    if song is None:
+                        continue
+                    found_songs.append(song)
+                except Exception:
+                    continue
 
     return found_songs
 
 
-def downloadSong(destination, filename, entry):
-    click.echo(f"""Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}...""")
+def downloadSong(destination, filename, entry, dlid, dldesc):
+    click.echo(f"""> Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}...""")
 
-    try:
-        p = requests.get(entry['dl_link'])
-        if p.status_code != 200:
-            raise HTTPError(entry['dl_link'], p.status_code, "", None, None)
+    if dlid is None:
+        dl_links = entry['dl_links']
+    else:
+        try:
+            dl_links = [entry['dl_links'][dlid - 1]]
+        except Exception:
+            click.echo(f"Invalid download link ID {dlid}.")
+            return
 
-        parsed_html = BeautifulSoup(p.text, 'html.parser')
-        download_url = parsed_html.body.find('div', attrs={'class':'lock-head'}).find('a').get('href')
-    except Exception as e:
-        click.echo(f"Failed parsing or retrieving HTML link: {e}")
-        return None
+    if dldesc is not None:
+        new_dl_links = list()
+        # Use a distinct loop variable here; reusing "entry" would shadow the song
+        # dict parameter and break the filename fields below
+        for link_entry in dl_links:
+            if dldesc in link_entry['description']:
+                new_dl_links.append(link_entry)
+        dl_links = new_dl_links
 
-    download_filename = filename.format(
-        genre=entry['genre'],
-        artist=entry['artist'],
-        album=entry['album'],
-        title=entry['title'],
-        year=entry['year'],
-        author=entry['author'],
-        orig_name=download_url.split('/')[-1],
-    )
-    download_filename = f"{destination}/{download_filename}"
-    download_path = '/'.join(f"{download_filename}".split('/')[0:-1])
+    if not dl_links:
+        click.echo(f'No download link matching description "{dldesc}" found.')
+        return
 
-    if os.path.exists(download_filename):
-        click.echo(f"File exists at {download_filename}")
-        return None
-
-    click.echo(f"""Downloading from {download_url}...""")
-    attempts = 1
-    p = None
-    try:
-        with requests.get(download_url, stream=True) as r:
-            while attempts <= 3:
-                try:
-                    r.raise_for_status()
-                    break
-                except Exception:
-                    click.echo(f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/3")
-                    sleep(attempts)
-                    attempts += 1
-            if r is None or r.status_code != 200:
-                if r:
-                    code = r.status_code
-                else:
-                    code = "-1"
-                raise HTTPError(download_url, code, "", None, None)
-
-            if not os.path.exists(download_path):
-                os.makedirs(download_path)
-
-            with open(download_filename, 'wb') as f:
-                for chunk in r.iter_content(chunk_size=8192):
-                    f.write(chunk)
-            click.echo(f"Successfully downloaded to {download_filename}")
-    except Exception as e:
-        click.echo(f"Download attempt failed: {e}")
-        return None
+    for dl_link in dl_links:
+        try:
+            p = requests.get(dl_link['link'])
+            if p.status_code != 200:
+                raise HTTPError(dl_link['link'], p.status_code, "", None, None)
+
+            parsed_html = BeautifulSoup(p.text, 'html.parser')
+            download_url = parsed_html.body.find('div', attrs={'class':'lock-head'}).find('a').get('href')
+        except Exception as e:
+            click.echo(f"Failed parsing or retrieving HTML link: {e}")
+            continue
+
+        download_filename = filename.format(
+            genre=entry['genre'],
+            artist=entry['artist'],
+            album=entry['album'],
+            title=entry['title'],
+            year=entry['year'],
+            author=entry['author'],
+            orig_name=download_url.split('/')[-1],
+        )
+        download_filename = f"{destination}/{download_filename}"
+        download_path = '/'.join(f"{download_filename}".split('/')[0:-1])
+
+        click.echo(f"""Downloading file "{dl_link['description']}" from {download_url}...""")
+        if os.path.exists(download_filename):
+            click.echo(f"File exists at {download_filename}")
+            continue
+
+        attempts = 1
+        p = None
+        try:
+            with requests.get(download_url, stream=True) as r:
+                while attempts <= 3:
+                    try:
+                        r.raise_for_status()
+                        break
+                    except Exception:
+                        click.echo(f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/3")
+                        sleep(attempts)
+                        attempts += 1
+                if r is None or r.status_code != 200:
+                    if r:
+                        code = r.status_code
+                    else:
+                        code = "-1"
+                    raise HTTPError(download_url, code, "", None, None)
+
+                if not os.path.exists(download_path):
+                    os.makedirs(download_path)
+
+                with open(download_filename, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+                click.echo(f"Successfully downloaded to {download_filename}")
+        except Exception as e:
+            click.echo(f"Download attempt failed: {e}")
+            continue
@@ -158,7 +237,11 @@ def downloadSong(destination, filename, entry):
     "-p", "--pages", "_pages", type=int, default=None, envvar='C3DBDL_BUILD_PAGES',
     help="Number of pages to scan (default is all)."
 )
-def build_database(_overwrite, _pages):
+@click.option(
+    "-c", "--concurrency", "_concurrency", type=int, default=10, envvar='C3DBDL_BUILD_CONCURRENCY',
+    help="Number of concurrent song page downloads to perform at once."
+)
+def build_database(_overwrite, _pages, _concurrency):
     """
     Initialize the local JSON database of C3DB songs from the website.
@@ -173,7 +256,7 @@
         exit(1)
 
     click.echo("Building JSON database; this will take a long time...")
-    songs_database = buildDatabase(_pages)
+    songs_database = buildDatabase(_pages, _concurrency)
     click.echo('')
     click.echo(f"Found {len(songs_database)} songs, dumping to database file '{config['database_filename']}'")
     if not os.path.exists(config['download_directory']):
@@ -267,7 +350,17 @@ def database():
     default=None, type=int,
     help='Limit to this many songs (first N matches).'
 )
-def download(_filters, _limit, _file_structure):
+@click.option(
+    "-i", "--download-id", "_id",
+    default=None, type=int,
+    help='Download only "dl_links" entry N (1 is first, etc.), or all if unspecified.'
+)
+@click.option(
+    "-d", "--download-descr", "_desc",
+    default=None,
+    help='Download only "dl_links" entries with this in their description (fuzzy).'
+)
+def download(_filters, _id, _desc, _limit, _file_structure):
     """
     Download song(s) from the C3DB webpage.
@@ -286,14 +379,12 @@
     The default output file structure is:
         "{genre}/{author}/{artist}/{album}/{title} [{year}].{orig_name}"
 
-    \b
     Filters allow granular selection of the song(s) to download. Multiple filters
     can be specified, and a song is selected only if ALL filters match (logical AND). Each filter
-    is in the form:
-        --filter [database_key] [value]
+    is in the form "--filter [database_key] [value]".
 
-    \b
-    The valid "database_key" values are identical to the output file fields above.
+    The valid "database_key" values are identical to the output file fields above, except
+    for "orig_name".
 
     \b
     For example, to download all songs in the genre "Rock":
         --filter genre Rock
 
     Or to download all songs by the artist "Rush" and the author "MyName":
         --filter artist Rush --filter author MyName
 
+    In addition to filters, each song may have more than one download link, providing
+    multiple versions of the same song (for example, normal and multitracks, or alternate
+    charts). The "-i"/"--download-id" and "-d"/"--download-descr" options select among
+    these links for each song; leave both unset to download every available file for a
+    given song. These options are most useful with very restrictive filters, and less
+    so when downloading many songs at once.
+
     \b
     The following environment variables can be used for scripting purposes:
       * C3DBDL_DL_FILE_STRUCTURE: equivalent to "--file-structure"
@@ -331,7 +429,7 @@
     click.echo(f"Downloading {len(pending_songs)} song files...")
 
     for song in pending_songs:
-        downloadSong(config['download_directory'], _file_structure, song)
+        downloadSong(config['download_directory'], _file_structure, song, _id, _desc)
 
 
 @click.group(context_settings=CONTEXT_SETTINGS)