From cdc67eb1143daad8afc7f5624ddb2fe930198a0a Mon Sep 17 00:00:00 2001 From: Joshua Boniface Date: Sun, 2 Apr 2023 12:50:09 -0400 Subject: [PATCH] Add c2dbdl script --- README.md | 128 +++++++++++++++ c3dbdl | 404 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 4 + 3 files changed, 536 insertions(+) create mode 100644 README.md create mode 100755 c3dbdl create mode 100644 requirements.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..b61c645 --- /dev/null +++ b/README.md @@ -0,0 +1,128 @@ +# C3DB Download Tool + +The C3DB Download Tool allows for easy scraping to a local JSON database and downloading of files from the C3 +(Customs Creators Collective) database, a collection of custom songs for Guitar Hero, Rock Band, and similar clone +games. + +This tool exists because the C3DB is very hard to mass download from: each song must be found in the extensive +list, selected manually, and a second link clicked through, before a random file name is obtained. This tool +simplifies the process by first collecting information about all available songs of a particular type, and then is +able to download songs based on customizable filters (e.g. by genre, artist, author, etc.) and output them in a +standardized format. + +To use the tool, first use the "database" command to build or modify your local JSON database, then use the +"download" command to download songs. + +To avoid overloading or abusing the C3DB website, this tool operates exclusively in sequential mode by design; at +most one page is scraped (for "database build") or song downloaded (for "download") at once. Additionally, the tool +design ensures that the JSON database of songs is stored locally, so it only needs to be built once and then is +reused to perform actual downloads without putting further load on the website. + +## Installation + +1. Install the Python3 requirements from `requirements.txt`. + +1. 
Copy the script to a virtualenv, somewhere in your $PATH or execute directly from this folder (see Usage below). + +## Usage + +Before running a command, use the built-in help via the `-h`/`--help` option to view the available option(s) of +the command. + +The general process of using `c3dbdl` is as follows: + +1. Select a download location, and either specify it with the `-d`/`--download-directory` option or via the +environment variable `C3DBDL_DOWNLOAD_DIRECTORY`. + +1. Select a base URL. Use this to determine what game(s) you want to limit to, or use the default to +fetch all available songs for all games, and either specify it with the `-u`/`--base-url` option or via the +environment variable `C3DBDL_BASE_URL`. + +1. Initialize your C3DB JSON database with `c3dbdl [options] database build`. This will take a fair amount +of time to complete as all pages of the chosen base URL are scanned. Note that if you cancel this process, no +data will be saved, so let it complete! + +1. Download any song(s) you want with `c3dbdl [options] download [options]`. + +## Filtering + +Filtering out the songs in the database is a key part of this tool. You might want to be able to grab only select +genres, artists, authors, etc. to make your custom song packs. + +`c3dbdl` is able to filter by several key categories: + +* `genre`: The genre of the song. +* `artist`: The artist of the song. +* `album`: The album of the song. +* `title`: The title of the song. +* `year`: The year of the album/song. +* `author`: The author of the file on C3DB. + +Note that we *cannot* filter - mostly for parsing difficulty reasons - by instrument type or difficulty, by song +length, or by any other information not mentioned above. + +Filtering is always done during the download stage; the JSON database will always contain all possible entries. + +To use filters, append one or more `--filter` options to your `c3dbdl download` command. 
A filter option begins +with the literal `--filter`, followed by the category (e.g. `genre` or `artist`), then finally the text to filter +on, for instance `Rock` or `Santana` or `2012`. The text must be quoted if it contains whitespace. + +If more than one filter is specified, they are treated as a logical AND, i.e. all the listed filters must apply to +a given song for it to be downloaded in that run. + +Filters allow powerfully specific download selections to be run. For example, let's look for all songs by Rush +from the album Vapor Trails (the remixed version) authored by ejthedj: + +``` +c3dbdl download --filter artist Rush --filter album "Vapor Trails [Remixed]" --filter author ejthedj +``` + +This should find, as of 2023-04-02, exactly one song, "Sweet Miracle": + +``` +Found 28942 songs from JSON database file 'Downloads/c3db.json' +Downloading 1 song files... +Downloading song "Rush - Sweet Miracle" by ejthedj... +Downloading from https://dl.c3universe.com/s/ejthedj/sweetMiracle... +``` + +Feel free to experiment. + +## Output Format + +When downloading files, it may be advantageous to customize the output directory and filename structure to better +match what you plan to do with the files. For instance, for pure organization you might want nicely laid out +files with clear directory structures and names, while for Onyx packaging you might want everything in a flat +directory. + +`c3dbdl` provides complete flexibility in the output file format. When downloading, use the `--file-structure` +option to set the file structure. This value is an interpolated string containing one or more field variables, +which are mapped at download time. The available fields are: + +* `genre`: The genre of the song. +* `artist`: The artist of the song. +* `album`: The album of the song. +* `title`: The title of the song. +* `year`: The year of the album/song. +* `author`: The author of the file on C3DB. +* `orig_name`: The original filename that would be downloaded by e.g. a browser. 
+ +The default structure leverages all of these options to create an archive-ready structure as follows: + +``` +{genre}/{artist}/{album}/{title} [{year}] ({author}).{orig_name} +``` + +As an example: + +``` +Prog/Rush/Vapor Trails [Remixed]/Sweet Miracle [2002] (ejthedj).sweetMiracle +``` + +Note that any parent director(ies) will be automatically created down the whole tree until the final filename. + +## Help + +This is a quick and dirty tool I wrote to quickly grab collections of songs. I provide no guarantee of success +when using this tool. If you have issues, please open an issue on this repository and provide *full details* +of your problem. diff --git a/c3dbdl b/c3dbdl new file mode 100755 index 0000000..5305e59 --- /dev/null +++ b/c3dbdl @@ -0,0 +1,404 @@ +#!/usr/bin/env python3 + +import click +import requests +import re +import json +import os +from time import sleep +from difflib import unified_diff +from colorama import Fore +from bs4 import BeautifulSoup +from urllib.error import HTTPError + +CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'], max_content_width=120) + + +def buildDatabase(pages=None): + found_songs = [] + + if pages is None: + r = requests.get(f"{config['base_songs_url']}") + if r.status_code != 200: + return + + root_page_html = BeautifulSoup(r.text, 'html.parser') + pages = int(root_page_html.body.find('a', attrs={'class':'paginationLastPage'}).get('href').replace('?page=', '')) + + click.echo(f"Collecting data from {pages} pages") + + # Get a list of song URIs + for i in range(1, pages + 1): + attempts = 1 + p = None + while attempts <= 5: + try: + click.echo(f"Parsing page {i} (attempt #{attempts})...") + p = requests.get(f"{config['base_songs_url']}?page={i}") + break + except Exception: + sleep(attempts) + attempts += 1 + if p is None or p.status_code != 200: + break + + parsed_html = BeautifulSoup(p.text, 'html.parser') + + table_html = parsed_html.body.find('div', attrs={'class':'portlet-body'}).find('tbody') + + for 
entry in table_html.find_all('tr', attrs={'class':'odd'}): + if len(entry) < 1: + break + + song_entry = dict() + + for idx, td in enumerate(entry.find_all('td')): + if idx == 1: + # Download link + song_entry["dl_link"] = td.find('a', attrs={'target':'_blank'}).get('href') + elif idx == 2: + # Artist + song_entry["artist"] = td.find('a').get_text().strip().replace('/', '+') + elif idx == 3: + # Song + song_entry["title"] = td.find('div', attrs={'class':'c3ttitlemargin'}).get_text().strip().replace('/', '+') + song_entry["album"] = td.find('div', attrs={'class':'c3tartist'}).get_text().strip().replace('/', '+') + elif idx == 4: + # Genre + song_entry["genre"] = td.find('a').get_text().strip() + elif idx == 5: + # Year + song_entry["year"] = td.find('a').get_text().strip() + elif idx == 6: + # Length + song_entry["length"] = td.find('a').get_text().strip() + elif idx == 8: + # Author (of chart) + song_entry["author"] = td.find('a').get_text().strip().replace('/', '+') + + if song_entry and song_entry['title']: + click.echo(f"Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}") + found_songs.append(song_entry) + + return found_songs + + +def downloadSong(destination, filename, entry): + click.echo(f"""Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}...""") + + try: + p = requests.get(entry['dl_link']) + if p.status_code != 200: + raise HTTPError(entry['dl_link'], p.status_code, "", None, None) + + parsed_html = BeautifulSoup(p.text, 'html.parser') + download_url = parsed_html.body.find('div', attrs={'class':'lock-head'}).find('a').get('href') + except Exception as e: + click.echo(f"Failed parsing or retrieving HTML link: {e}") + return None + + download_filename = filename.format( + genre=entry['genre'], + artist=entry['artist'], + album=entry['album'], + title=entry['title'], + year=entry['year'], + author=entry['author'], + orig_name=download_url.split('/')[-1], + ) + download_filename = 
f"{destination}/{download_filename}" + download_path = '/'.join(f"{download_filename}".split('/')[0:-1]) + + if not os.path.exists(download_path): + os.makedirs(download_path) + + if os.path.exists(download_filename): + click.echo(f"File exists at {download_filename}") + return None + + click.echo(f"""Downloading from {download_url}...""") + attempts = 1 + p = None + try: + with requests.get(download_url, stream=True) as r: + while attempts <= 5: + try: + r.raise_for_status() + break + except Exception: + click.echo(f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/5") + sleep(attempts) + attempts += 1 + if r is None or r.status_code != 200: + if r: + code = r.status_code + else: + code = "-1" + raise HTTPError(download_url, code, "", None, None) + with open(download_filename, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) + click.echo(f"Successfully downloaded to {download_filename}") + except Exception as e: + click.echo(f"Download attempt failed: {e}") + return None + + + +@click.command(name='build', short_help='Build the local database.') +@click.option( + "-o", "--overwrite", '_overwrite', is_flag=True, default=False, envvar='C3DLDB_BUILD_OVERWRITE', + help="Overwrite existing database file." +) +@click.option( + "-p", "--pages", "_pages", type=int, default=None, envvar='C3DBDL_BUILD_PAGES', + help="Number of pages to scan (default is all)." +) +def build_database(_overwrite, _pages): + """ + Initialize the local JSON database of C3DB songs from the website. 
+ + \b + The following environment variables can be used for scripting purposes: + * C3DLDB_BUILD_OVERWRITE: equivalent to "--overwrite" + * C3DBDL_BUILD_PAGES: equivalent to "--pages" + """ + + if os.path.exists(config['database_filename']) and not _overwrite: + click.echo(f"Database already exists at '{config['database_filename']}'; use '--overwrite' to rebuild.") + exit(1) + + click.echo("Building JSON database; this will take a long time...") + songs_database = buildDatabase(_pages) + click.echo('') + click.echo(f"Found {len(songs_database)} songs, dumping to database file '{config['database_filename']}'") + if not os.path.exists(config['download_directory']): + click.echo(f"Creating download directory '{config['download_directory']}'") + os.makedirs(config['download_directory']) + with open(config['database_filename'], "w") as fh: + json.dump(songs_database, fh, indent=2) + fh.write('\n') + + +@click.command(name='edit', short_help='Edit the local database in EDITOR.') +def edit_database(): + """ + Edit the local JSON database of C3DB songs with your $EDITOR. 
+ """ + + if not os.path.exists(config['database_filename']): + click.echo(f"WARNING: Database filename '{config['database_filename']}' does not exist!") + click.echo("Ensure you build a database first with the 'database build' command.") + exit(1) + + with open(config['database_filename'], "r") as fh: + songs_database = fh.read() + + new_songs_database = click.edit(text=songs_database, require_save=True, extension='.json') + while True: + if new_songs_database is None: + click.echo("Aborting with no modifications") + exit(0) + + click.echo('') + click.echo("Pending modifications:") + click.echo('') + diff = list(unified_diff( + songs_database.split('\n'), + new_songs_database.split('\n'), + fromfile='current', + tofile='modified', + fromfiledate='', + tofiledate='', + n=3, + lineterm='')) + for line in diff: + if re.match(r'^\+', line) is not None: + click.echo(Fore.GREEN + line + Fore.RESET) + elif re.match(r'^\-', line) is not None: + click.echo(Fore.RED + line + Fore.RESET) + elif re.match(r'^\^', line) is not None: + click.echo(Fore.BLUE + line + Fore.RESET) + else: + click.echo(line) + click.echo('') + + try: + json.loads(new_songs_database) + break + except Exception: + click.echo('ERROR: Invalid JSON syntax.') + click.confirm('Continue editing?', abort=True) + new_songs_database = click.edit(text=new_songs_database, require_save=True, extension='.json') + + click.confirm('Write modifications to songs database?', abort=True) + + with open(config['database_filename'], "w") as fh: + fh.write(new_songs_database) + + +@click.group(name="database", short_help='Manage the local database.') +def database(): + """ + Manage the local JSON database of C3DB songs. 
+ """ + + pass + + +@click.command(name="download", short_help='Download files from C3DB.') +@click.option( + '-s', '--file-structure', '_file_structure', envvar='C3DBDL_DL_FILE_STRUCTURE', + default="{genre}/{artist}/{album}/{title} [{year}] ({author}).{orig_name}", + help='Specify the output file/directory stucture.' +) +@click.option( + '-f', '--filter', '_filters', envvar='C3DBDL_DL_FILTERS', + default=[], multiple=True, + nargs=2, + help='Add a filter option.' +) +@click.option( + '-l', '--limit', '_limit', envvar='C3DBDL_DL_LIMIT', + default=None, type=int, + help='Limit to this many songs (first N matches).' +) +def download(_filters, _limit, _file_structure): + """ + Download song(s) from the C3DB webpage. + + \b + The output file structure can be specified as a path format with any of the following + fields included, surrounded by curly braces: + * genre: The genre of the song. + * artist: The artist of the song. + * album: The album of the song. + * title: The title of the song. + * year: The year of the album/song. + * author: The author of the file on C3DB. + * orig_name: The original filename from the website. + + \b + The default output file structure is: + "{genre}/{artist}/{album}/{title} [{year}] ({author}).{orig_file}" + + \b + Filters allow granular selection of the song(s) to download. Multiple filters can be + specified, and a song is selected only if ALL filters match (logical AND). Each filter + is in the form: + --filter [database_key] [value] + + \b + The valid "database_key" values are identical to the output file fields above. 
+ + \b + For example, to download all songs in the genre "Rock": + --filter genre Rock + + \b + Or to download all songs by the artist "Rush" and the author "MyName": + --filter artist Rush --filter author MyName + + \b + The following environment variables can be used for scripting purposes: + * C3DBDL_DL_FILE_STRUCTURE: equivalent to "--file-structure" + * C3DBDL_DL_FILTERS: equivalent to "--filter"; limited to one instance + * C3DBDL_DL_LIMIT: equivalent to "--limit" + """ + + with open(config['database_filename'], "r") as fh: + all_songs = json.load(fh) + click.echo(f"Found {len(all_songs)} songs from JSON database file '{config['database_filename']}'") + + pending_songs = list() + + for song in all_songs: + if len(_filters) < 1: + add_to_pending = True + else: + add_to_pending = all(song[_filter[0]] == _filter[1] for _filter in _filters) + + if add_to_pending: + pending_songs.append(song) + + if _limit is not None: + pending_songs = pending_songs[0:_limit] + + click.echo(f"Downloading {len(pending_songs)} song files...") + + for song in pending_songs: + downloadSong(config['download_directory'], _file_structure, song) + + +@click.group(context_settings=CONTEXT_SETTINGS) +@click.option( + '-u', '--base-url', '_base_url', envvar='C3DBDL_BASE_URL', + default='https://db.c3universe.com/songs/all', show_default=True, + help='Base URL of the online C3DB songs page' +) +@click.option( + '-d', '--download-directory', '_download_directory', envvar='C3DBDL_DOWNLOAD_DIRECTORY', + default='~/Downloads', show_default=True, + help='Download directory for JSON database and songs' +) +@click.option( + '-j', '--json-database', '_json_database', envvar='C3DBDL_JSON_DATABASE', + default='c3db.json', show_default=True, + help='JSON database filename within download directory' +) +def cli(_base_url, _download_directory, _json_database): + """ + C3DB Download Tool + + The C3DB Download Tool allows for easy scraping to a local JSON database and downloading + of files from the C3 
(Customs Creators Collective) database, a collection of custom songs + for Guitar Hero, Rock Band, and similar clone games. + + This tool exists because the C3DB is very hard to mass download from: each song must + be found in the extensive list, selected manually, and a second link clicked through, + before a random file name is obtained. This tool simplifies the process by first collecting + information about all available songs of a particular type, and then is able to download + songs based on customizable filters (e.g. by genre, artist, author, etc.) and output them + in a standardized format. + + To use the tool, first use the "database" command to build or modify your local JSON + database, then use the "download" command to download songs. + + To avoid overloading or abusing the C3DB website, this tool operates exclusively in + sequential mode by design; at most one page is scraped (for "database build") or song + downloaded (for "download") at once. Additionally, the tool design ensures that the JSON + database of songs is stored locally, so it only needs to be built once and then is reused + to perform actual downloads without putting further load on the website. 
+ + \b + The following environment variables can be used for scripting purposes: + * C3DBDL_BASE_URL: equivalent to "--base-url" + * C3DBDL_DOWNLOAD_DIRECTORY: equivalent to "--download_directory" + * C3DBDL_JSON_DATABASE: equivalent to "--json-database" + + """ + + global config + + # Expand any ~ in the download directory pathname + _download_directory = os.path.expanduser(_download_directory) + + # Populate the configuration store + config['base_songs_url'] = _base_url + config['download_directory'] = _download_directory + config['database_filename'] = f"{_download_directory}/{_json_database}" + + +config = dict() + +database.add_command(build_database) +database.add_command(edit_database) + +cli.add_command(database) +cli.add_command(download) + +def main(): + return cli(obj={}) + +if __name__ == '__main__': + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..910928f --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +Click +requests +colorama +beautifulsoup4