#!/usr/bin/env python3

# c3dbdl - Customs Creators Collective archive tool
#
#    Copyright (C) 2023 Joshua M. Boniface
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
###############################################################################

import json
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from difflib import unified_diff
from time import sleep
from urllib.error import HTTPError

import click
import requests
from bs4 import BeautifulSoup
from colorama import Fore

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], max_content_width=120)

# Number of times a page fetch or file download is tried before giving up
MAX_ATTEMPTS = 3

# Global configuration store; populated by the "cli" group callback before any
# subcommand runs.
config = dict()


def fetchSongData(entries):
    """
    Parse one song from the song list table and fetch its download links.

    "entries" is a two-element sequence of table rows: the first row holds the
    song details (artist, title, album, ...), the second row holds the
    instrument difficulty links.

    Returns a tuple of (messages, song_entry). "messages" is a list of progress
    strings for the caller to print (this function runs in a worker thread, so
    it does not print directly). "song_entry" is the parsed song dictionary, or
    None if the song page could not be fetched or has no usable download links.
    """
    song_entry = {
        "artist": None,
        "title": None,
        "album": None,
        "song_link": None,
        "genre": None,
        "year": None,
        "length": None,
        "author": None,
        "instruments": dict(),
        "dl_links": list(),
    }
    messages = list()

    # Find song details; slashes are replaced so that the values are safe to
    # use as path components in the download file structure
    for idx, td in enumerate(entries[0].find_all("td")):
        if idx == 2:
            # Artist
            song_entry["artist"] = td.find("a").get_text().strip().replace("/", "+")
        elif idx == 3:
            # Song
            song_entry["title"] = (
                td.find("div", attrs={"class": "c3ttitlemargin"})
                .get_text()
                .strip()
                .replace("/", "+")
            )
            song_entry["album"] = (
                td.find("div", attrs={"class": "c3tartist"})
                .get_text()
                .strip()
                .replace("/", "+")
            )
            # Song page: the first anchor with a non-empty href
            for link in td.find_all("a", href=True):
                if link.get("href"):
                    song_entry["song_link"] = link.get("href")
                    break
        elif idx == 4:
            # Genre
            song_entry["genre"] = td.find("a").get_text().strip()
        elif idx == 5:
            # Year
            song_entry["year"] = td.find("a").get_text().strip()
        elif idx == 6:
            # Length
            song_entry["length"] = td.find("a").get_text().strip()
        elif idx == 8:
            # Author (of chart)
            song_entry["author"] = td.find("a").get_text().strip().replace("/", "+")

    # Find song instruments and difficulties; only the first cell containing
    # instrument blocks is used
    instrument_attrs = {"style": "width:110px;float:left"}
    for td in entries[1].find_all("td"):
        instrument_divs = td.find_all("div", attrs=instrument_attrs)
        if not instrument_divs:
            continue
        for instrument in instrument_divs:
            # The second link of each block ends in ".../<prefix>_<name>/<diff>"
            difficulty_link = (
                instrument.find_all(
                    "a", attrs={"style": "text-decoration: none;color:#000"}
                )[1]
                .get("href")
                .split("/")
            )
            instrument_name = (
                difficulty_link[-2].split("_")[-1].replace("prokeys", "keys")
            )
            instrument_diff = int(difficulty_link[-1])
            if instrument_diff < 1:
                # No part
                instrument_difficulty = None
            else:
                # Link difficulty - 1
                instrument_difficulty = instrument_diff - 1
            song_entry["instruments"][instrument_name] = instrument_difficulty
        break

    if not (song_entry["author"] and song_entry["title"] and song_entry["song_link"]):
        # Incomplete entry: there is no song page to visit, so it is returned
        # as parsed, without download links
        return messages, song_entry

    messages.append(
        f"> Found song entry for {song_entry['artist']} - {song_entry['title']} by {song_entry['author']}"
    )

    # Get download links from the actual song page
    sp = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        messages.append(
            f"Parsing song page {song_entry['song_link']} (attempt {attempt}/{MAX_ATTEMPTS})..."
        )
        try:
            sp = requests.get(song_entry["song_link"])
            break
        except requests.RequestException:
            sleep(attempt)
    if sp is None or sp.status_code != 200:
        messages.append("Failed to fetch song page, aborting")
        return messages, None

    song_parsed_html = BeautifulSoup(sp.text, "html.parser")
    download_section = song_parsed_html.find(
        "div", attrs={"class": "portlet light bg-inverse"}
    )
    if download_section is None:
        messages.append(
            "Found no download section on song page, not adding to database"
        )
        return messages, None

    dl_links = list()
    for link_entry in download_section.find_all("a", href=True):
        link = link_entry.get("href")
        description = link_entry.get_text().strip()
        if "c3universe.com" not in link:
            continue
        messages.append(f"Found download link: {link} ({description})")
        dl_links.append(
            {
                "link": link,
                "description": description,
            }
        )
    if not dl_links:
        messages.append(
            "Found no c3universe.com download links for song, not adding to database"
        )
        return messages, None
    song_entry["dl_links"] = dl_links

    # Return messages and song entry
    return messages, song_entry


def _fetchSongListTable(page):
    """
    Fetch one page of the song list and return its table body element.

    The page is tried up to MAX_ATTEMPTS times; None is returned if no attempt
    produced a page containing the expected song table.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        click.echo(f"Parsing page {page} (attempt {attempt}/{MAX_ATTEMPTS})...")
        try:
            p = requests.get(f"{config['base_songs_url']}?page={page}")
        except requests.RequestException:
            p = None

        if p is not None and p.status_code == 200:
            parsed_html = BeautifulSoup(p.text, "html.parser")
            if parsed_html.body is not None:
                portlet = parsed_html.body.find("div", attrs={"class": "portlet-body"})
                table_html = portlet.find("tbody") if portlet is not None else None
                if table_html is not None:
                    return table_html

        if attempt < MAX_ATTEMPTS:
            sleep(attempt)

    return None


def buildDatabase(pages, concurrency):
    """
    Scrape the online song list and return a list of song entry dictionaries.

    "pages" is the number of list pages to scan, or None to detect the total
    from the site's pagination. "concurrency" is the number of song pages
    fetched in parallel.

    Returns None if the page count had to be detected and the root page could
    not be fetched. List pages that cannot be fetched are skipped.
    """
    found_songs = []

    if pages is None:
        r = requests.get(f"{config['base_songs_url']}")
        if r.status_code != 200:
            click.echo(f"Failed to fetch song list root page (HTTP {r.status_code})")
            return None

        root_page_html = BeautifulSoup(r.text, "html.parser")
        last_page_href = root_page_html.body.find(
            "a", attrs={"class": "paginationLastPage"}
        ).get("href")
        # Accept both a bare "?page=N" and a full URL carrying the parameter
        page_match = re.search(r"[?&]page=(\d+)", last_page_href)
        if page_match is None:
            click.echo(f"Failed to determine page count from '{last_page_href}'")
            return None
        pages = int(page_match.group(1))

    click.echo(f"Collecting data from {pages} pages")

    # Get a list of song URIs
    for i in range(1, pages + 1):
        table_html = _fetchSongListTable(i)
        if table_html is None:
            click.echo(f"Failed to fetch page {i}, skipping")
            continue

        # This is weird, but because of the table layout, there are two table rows for
        # each song: the first is the song info, the second is the instruments
        # So we must make a single "entry" that is a list of the two elements, then
        # handle that later in fetchSongData.
        entries = list()
        entry_data = list()
        for entry in table_html.find_all("tr", attrs={"class": "odd"}):
            if len(entry) < 1:
                break
            entry_data.append(entry)
            if len(entry_data) == 2:
                entries.append(entry_data)
                entry_data = list()

        click.echo("Fetching and parsing song pages...")
        with ThreadPoolExecutor(max_workers=concurrency) as executor:
            futures = [executor.submit(fetchSongData, entry) for entry in entries]
            for future in as_completed(futures):
                try:
                    messages, song = future.result()
                except Exception as e:
                    # A malformed row must not abort the whole build, but it
                    # should not vanish silently either
                    click.echo(f"Failed to parse song entry: {e!r}")
                    continue
                click.echo("\n".join(messages))
                if song is not None:
                    found_songs.append(song)

    return found_songs


def _removeIfExists(path):
    """
    Remove the file at path, ignoring the case where it does not exist.
    """
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def downloadSong(destination, filename, entry, dlid, dldesc):
    """
    Download the file(s) for a single song entry.

    "destination" is the base download directory and "filename" the file
    structure format string (see the "download" command help). "entry" is a
    song dictionary from the database. "dlid" optionally selects a single
    "dl_links" entry (1-based) and "dldesc" optionally keeps only the links
    whose description contains the given text.

    Files that already exist are skipped. Data is written to a temporary
    ".part" file and renamed once complete, so an interrupted download is never
    mistaken for a finished one.
    """
    click.echo(
        f"""> Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}..."""
    )

    if dlid is None:
        dl_links = entry["dl_links"]
    elif 1 <= dlid <= len(entry["dl_links"]):
        dl_links = [entry["dl_links"][dlid - 1]]
    else:
        # Includes 0 and negative IDs, which would otherwise index from the end
        click.echo(f"Invalid download link ID {dlid}.")
        return

    if dldesc is not None:
        dl_links = [link for link in dl_links if dldesc in link["description"]]
        if not dl_links:
            click.echo(f'No download link matching description "{dldesc}" found.')
            return

    for dl_link in dl_links:
        # The stored link is an intermediate page; the real file URL is on it
        try:
            p = requests.get(dl_link["link"])
            if p.status_code != 200:
                raise HTTPError(dl_link["link"], p.status_code, "", None, None)

            parsed_html = BeautifulSoup(p.text, "html.parser")
            download_url = (
                parsed_html.body.find("div", attrs={"class": "lock-head"})
                .find("a")
                .get("href")
            )
            if not download_url:
                raise ValueError("no download URL found on page")
        except Exception as e:
            click.echo(f"Failed parsing or retrieving HTML link: {e}")
            continue

        format_fields = dict(
            genre=entry["genre"],
            artist=entry["artist"],
            album=entry["album"],
            title=entry["title"],
            year=entry["year"],
            author=entry["author"],
            orig_name=download_url.split("/")[-1],
        )
        try:
            download_filename = filename.format(**format_fields)
        except (KeyError, IndexError, ValueError) as e:
            # The structure is the same for every link, so there is no point
            # in trying the remaining ones
            click.echo(f"Invalid file structure '{filename}': {e}")
            return
        download_filename = f"{destination}/{download_filename}"
        download_path = os.path.dirname(download_filename)
        partial_filename = f"{download_filename}.part"

        click.echo(
            f"""Downloading file "{dl_link['description']}" from {download_url}..."""
        )
        if os.path.exists(download_filename):
            click.echo(f"File exists at {download_filename}")
            continue

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                # A fresh request is made on every attempt
                with requests.get(download_url, stream=True) as r:
                    if r.status_code != 200:
                        raise HTTPError(download_url, r.status_code, "", None, None)

                    if download_path:
                        os.makedirs(download_path, exist_ok=True)

                    with open(partial_filename, "wb") as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_filename, download_filename)
                click.echo(f"Successfully downloaded to {download_filename}")
                break
            except (requests.RequestException, HTTPError, OSError) as e:
                _removeIfExists(partial_filename)
                click.echo(
                    f"Download attempt failed: {e}; attempt {attempt}/{MAX_ATTEMPTS}"
                )
                if attempt < MAX_ATTEMPTS:
                    sleep(attempt)
        else:
            click.echo(f"Giving up on {download_url}")


def _requireDatabase():
    """
    Exit with an explanatory message if the local database does not exist.
    """
    if not os.path.exists(config["database_filename"]):
        click.echo(
            f"WARNING: Database filename '{config['database_filename']}' does not exist!"
        )
        click.echo(
            "Ensure you build a database first with the 'database build' command."
        )
        raise SystemExit(1)


def _loadSongsDatabase():
    """
    Load and return the list of songs from the local JSON database.

    Exits with an explanatory message if the database does not exist.
    """
    _requireDatabase()

    with open(config["database_filename"], "r") as fh:
        all_songs = json.load(fh)

    click.echo(
        f"Found {len(all_songs)} songs from JSON database file '{config['database_filename']}'"
    )
    return all_songs


def _filterSongs(all_songs, filters):
    """
    Return the songs from all_songs that match ALL of the given filters.

    "filters" is a sequence of (field, value) pairs as collected by the
    "--filter" option. Fields and values are case insensitive. A field of
    "instrument" selects on the presence of a part (or its absence, when the
    value is prefixed with "no-"); any other field is compared against the
    song's value of that name, exactly or, when the value is prefixed with
    "~", as a substring.

    Exits with an error message on an unknown field or instrument.
    """
    information_filters = list()
    instrument_filters = list()
    for field, value in filters:
        field = field.lower()
        value = value.lower()
        if field == "instrument":
            # Only a leading "no-" is a negation
            wanted = not value.startswith("no-")
            name = value if wanted else value[len("no-"):]
            instrument_filters.append((name, wanted))
        else:
            # Only a leading "~" marks a fuzzy match
            fuzzy = value.startswith("~")
            information_filters.append((field, value[1:] if fuzzy else value, fuzzy))

    # An instrument is valid if any song in the database knows about it; a
    # single song lacking the key simply has no such part
    known_instruments = {
        name for song in all_songs for name in (song.get("instruments") or {})
    }
    for name, _wanted in instrument_filters:
        if known_instruments and name not in known_instruments:
            click.echo(f"Invalid instrument value '{name}'")
            raise SystemExit(1)

    matching_songs = list()
    for song in all_songs:
        results = list()

        for field, value, fuzzy in information_filters:
            if field not in song or not isinstance(song[field], (str, type(None))):
                click.echo(f"Invalid filter field '{field}'")
                raise SystemExit(1)
            # A missing (null) value is treated as an empty string
            song_value = (song[field] or "").lower()
            results.append(value in song_value if fuzzy else value == song_value)

        instruments = song.get("instruments") or {}
        for name, wanted in instrument_filters:
            has_part = instruments.get(name) is not None
            results.append(has_part == wanted)

        if all(results):
            matching_songs.append(song)

    return matching_songs


@click.command(name="build", short_help="Build the local database.")
@click.option(
    "-o",
    "--overwrite",
    "_overwrite",
    is_flag=True,
    default=False,
    # The second name is a historical misspelling kept for compatibility
    envvar=["C3DBDL_BUILD_OVERWRITE", "C3DLDB_BUILD_OVERWRITE"],
    help="Overwrite existing database file.",
)
@click.option(
    "-p",
    "--pages",
    "_pages",
    type=int,
    default=None,
    envvar="C3DBDL_BUILD_PAGES",
    help="Number of pages to scan (default is all).",
)
@click.option(
    "-c",
    "--concurrency",
    "_concurrency",
    type=int,
    default=10,
    envvar="C3DBDL_BUILD_CONCURRENCY",
    help="Number of concurrent song page downloads to perform at once.",
)
def build_database(_overwrite, _pages, _concurrency):
    """
    Initialize the local JSON database of C3DB songs from the website.

    \b
    The following environment variables can be used for scripting purposes:
        * C3DBDL_BUILD_OVERWRITE: equivalent to "--overwrite"
        * C3DBDL_BUILD_PAGES: equivalent to "--pages"
        * C3DBDL_BUILD_CONCURRENCY: equivalent to "--concurrency"
    """

    if os.path.exists(config["database_filename"]) and not _overwrite:
        click.echo(
            f"Database already exists at '{config['database_filename']}'; use '--overwrite' to rebuild."
        )
        raise SystemExit(1)

    click.echo("Building JSON database; this will take a long time...")

    songs_database = buildDatabase(_pages, _concurrency)
    if songs_database is None:
        # Never replace an existing database with nothing
        click.echo("Failed to build database; no changes written.")
        raise SystemExit(1)

    click.echo("")
    click.echo(
        f"Found {len(songs_database)} songs, dumping to database file '{config['database_filename']}'"
    )
    if not os.path.exists(config["download_directory"]):
        click.echo(f"Creating download directory '{config['download_directory']}'")
        os.makedirs(config["download_directory"])

    with open(config["database_filename"], "w") as fh:
        json.dump(songs_database, fh, indent=2)
        fh.write("\n")


@click.command(name="edit", short_help="Edit the local database in EDITOR.")
def edit_database():
    """
    Edit the local JSON database of C3DB songs with your $EDITOR.
    """

    _requireDatabase()

    with open(config["database_filename"], "r") as fh:
        songs_database = fh.read()

    new_songs_database = click.edit(
        text=songs_database, require_save=True, extension=".json"
    )

    while True:
        # click.edit returns None when the editor is closed without saving
        if new_songs_database is None:
            click.echo("Aborting with no modifications")
            raise SystemExit(0)

        click.echo("")
        click.echo("Pending modifications:")
        click.echo("")
        diff = unified_diff(
            songs_database.split("\n"),
            new_songs_database.split("\n"),
            fromfile="current",
            tofile="modified",
            fromfiledate="",
            tofiledate="",
            n=3,
            lineterm="",
        )
        for line in diff:
            if line.startswith("+"):
                click.echo(Fore.GREEN + line + Fore.RESET)
            elif line.startswith("-"):
                click.echo(Fore.RED + line + Fore.RESET)
            elif line.startswith("@"):
                # Hunk headers ("@@ -a,b +c,d @@")
                click.echo(Fore.BLUE + line + Fore.RESET)
            else:
                click.echo(line)
        click.echo("")

        try:
            json.loads(new_songs_database)
            break
        except json.JSONDecodeError:
            click.echo("ERROR: Invalid JSON syntax.")
            click.confirm("Continue editing?", abort=True)
            new_songs_database = click.edit(
                text=new_songs_database, require_save=True, extension=".json"
            )

    click.confirm("Write modifications to songs database?", abort=True)

    with open(config["database_filename"], "w") as fh:
        fh.write(new_songs_database)


@click.group(name="database", short_help="Manage the local database.")
def database():
    """
    Manage the local JSON database of C3DB songs.
    """
    pass


@click.command(name="download", short_help="Download files from C3DB.")
@click.option(
    "-s",
    "--file-structure",
    "_file_structure",
    envvar="C3DBDL_DL_FILE_STRUCTURE",
    default="{artist}/{album}/{title}.{author}.{orig_name}",
    help="Specify the output file/directory structure.",
)
@click.option(
    "-f",
    "--filter",
    "_filters",
    default=[],
    multiple=True,
    nargs=2,
    help="Add a search filter.",
)
@click.option(
    "-l",
    "--limit",
    "_limit",
    envvar="C3DBDL_DL_LIMIT",
    default=None,
    type=int,
    help="Limit to this many songs (first N matches).",
)
@click.option(
    "-i",
    "--download-id",
    "_id",
    envvar="C3DBDL_DL_ID",
    default=None,
    type=int,
    help='Download only "dl_links" entry N (1 is first, etc.), or all if unspecified.',
)
@click.option(
    "-d",
    "--download-descr",
    "_desc",
    envvar="C3DBDL_DL_DESCR",
    default=None,
    help='Download only "dl_links" entries with this in their description (fuzzy).',
)
def download(_filters, _id, _desc, _limit, _file_structure):
    """
    Download song(s) from the C3DB webpage.

    Filters allow granular selection of the song(s) to download. Multiple filters can be
    specified, and a song is selected only if ALL filters match (logical AND). Filters are
    specified in the form "--filter <field> <value>".

    For a full list of and explanation for filters, see the help output for the "search"
    command ("c3dbdl search --help").

    In addition to filters, each song may have more than one download link, to provide
    multiple versions of the same song (for example, normal and multitracks, or alternate
    charts). For each song, the "-i"/"--download-id" and "-d"/"--download-descr" options
    can help filter these out, or both can be left blank to download all possible files
    for a given song.

    \b
    The output file structure can be specified as a path format with any of the
    following fields included, surrounded by curly braces:
        * genre: The genre of the song.
        * artist: The artist of the song.
        * album: The album of the song.
        * title: The title of the song.
        * year: The year of the album/song.
        * author: The author of the file on C3DB.
        * orig_name: The original filename from the website.

    \b
    The default output file structure is:
        "{artist}/{album}/{title}.{author}.{orig_name}"

    \b
    The following environment variables can be used for scripting purposes:
        * C3DBDL_DL_FILE_STRUCTURE: equivalent to "--file-structure"
        * C3DBDL_DL_LIMIT: equivalent to "--limit"
        * C3DBDL_DL_ID: equivalent to "--download-id"
        * C3DBDL_DL_DESCR: equivalent to "--download-descr"
    """

    all_songs = _loadSongsDatabase()

    pending_songs = _filterSongs(all_songs, _filters)

    if _limit is not None:
        pending_songs = pending_songs[0:_limit]

    click.echo(f"Downloading {len(pending_songs)} songs...")

    for song in pending_songs:
        downloadSong(config["download_directory"], _file_structure, song, _id, _desc)


@click.command(name="search", short_help="Search for songs from local C3DB.")
@click.option(
    "-f",
    "--filter",
    "_filters",
    default=[],
    multiple=True,
    nargs=2,
    help="Add a search filter.",
)
def search(_filters):
    """
    Search for song(s) from the C3DB local database.

    Filters allow granular selection of the song(s) to download. Multiple filters can be
    specified, and a song is selected only if ALL filters match (logical AND). Filters are
    specified in the form "--filter <field> <value>".

    \b
    The valid fields for the "<field>" value are:
        * genre: The genre of the song.
        * artist: The artist of the song.
        * album: The album of the song.
        * title: The title of the song.
        * year: The year of the album/song.
        * author: The author of the file on C3DB.
        * instrument: An instrument chart for the song.

    \b
    For example, to download all songs in the genre "Rock":
        --filter genre Rock

    \b
    Or to download all songs by the artist "Rush" and the author "MyName":
        --filter artist Rush --filter author MyName

    Filter values are case insensitive, and non-instrument filters can be made fuzzy by
    adding a tilde ("~") to the beginning of the "<value>".

    \b
    For example, to match all songs with "Word" in their titles:
        --filter title ~word

    Instrument filters allow selection of the presence of instruments. If an instrument
    filter is given, only songs which contain parts for the given instrument(s) will be
    shown.

    \b
    The valid instruments are:
        * guitar
        * bass
        * drums
        * vocals
        * keys

    To negate an instrument filter and find only entries without the specified
    instrument, prepend "no-" to the instrument name.

    \b
    For example, to download only songs that have a keys part but no vocal part:
        --filter instrument keys --filter instrument no-vocals

    Note that while instrument difficulties are displayed in the output of this command,
    they can not be filtered on; this is up to the user to do manually. The purpose of
    instrument filters is to ensure that songs contain or don't contain given parts, not
    to granularly select the difficulty of said parts (that's for the players of the
    game to do, not us).
    """

    all_songs = _loadSongsDatabase()

    pending_songs = _filterSongs(all_songs, _filters)

    click.echo(f"Found {len(pending_songs)} matching songs:")
    click.echo()

    for entry in pending_songs:
        click.echo(
            f"""> Song: "{entry['artist']} - {entry['title']}" ({entry['length']}, {entry['genre']}) from "{entry['album']} ({entry['year']})" by {entry['author']}"""
        )
        instrument_list = [
            f"{instrument} [{difficulty}]"
            for instrument, difficulty in entry["instruments"].items()
        ]
        click.echo(
            f""" Instruments: {', '.join(instrument_list)}""",
        )
        click.echo(""" Available downloads:""")
        for link in entry["dl_links"]:
            click.echo(f""" * {link['description']}""")
        click.echo()


@click.group(context_settings=CONTEXT_SETTINGS)
@click.option(
    "-u",
    "--base-url",
    "_base_url",
    envvar="C3DBDL_BASE_URL",
    default="https://db.c3universe.com/songs/all",
    show_default=True,
    help="Base URL of the online C3DB songs page",
)
@click.option(
    "-d",
    "--download-directory",
    "_download_directory",
    envvar="C3DBDL_DOWNLOAD_DIRECTORY",
    default="~/Downloads",
    show_default=True,
    help="Download directory for JSON database and songs",
)
@click.option(
    "-j",
    "--json-database",
    "_json_database",
    envvar="C3DBDL_JSON_DATABASE",
    default="c3db.json",
    show_default=True,
    help="JSON database filename within download directory",
)
def cli(_base_url, _download_directory, _json_database):
    """
    Customs Creators Collective archive tool

    The Customs Creators Collective archive tool allows for easy scraping to a local JSON
    database and downloading of files from the C3 (Customs Creators Collective) database,
    a collection of custom songs for Rock Band and similar clone games.

    This tool exists because the C3DB is very hard to mass download from: each song must
    be found in the extensive list, selected manually, and a second link clicked through,
    before a random file name is obtained. This tool simplifies the process by first
    collecting information about all available songs of a particular type, and then is
    able to download songs based on customizable filters (e.g. by genre, artist, author,
    etc.) and output them in a standardized format.

    To use the tool, first use the "database" command to build or modify your local JSON
    database, then use the "download" command to download songs.

    \b
    The following environment variables can be used for scripting purposes:
        * C3DBDL_BASE_URL: equivalent to "--base-url"
        * C3DBDL_DOWNLOAD_DIRECTORY: equivalent to "--download-directory"
        * C3DBDL_JSON_DATABASE: equivalent to "--json-database"
    """

    # Expand any ~ in the download directory pathname
    _download_directory = os.path.expanduser(_download_directory)

    # Populate the configuration store
    config["base_songs_url"] = _base_url
    config["download_directory"] = _download_directory
    config["database_filename"] = f"{_download_directory}/{_json_database}"


database.add_command(build_database)
database.add_command(edit_database)
cli.add_command(database)
cli.add_command(download)
cli.add_command(search)


def main():
    """
    Entry point: run the command line interface.
    """
    return cli(obj={})


if __name__ == "__main__":
    main()