Improve Mediafire parsing even further

Handle Mediafire link when in Javascript
Add negation filters
2023-04-29 11:20:37 -04:00 · 2023-04-29 11:06:54 -04:00 · 2023-04-28 20:55:53 -04:00 · 2023-04-28 18:40:54 -04:00
1 changed file with 96 additions and 48 deletions
--- a/c3dbdl/c3dbdl.py
+++ b/c3dbdl/c3dbdl.py
@@ -153,8 +153,6 @@ def fetchSongData(entries):
        for link_entry in download_links:
            link = link_entry.get("href")
            description = link_entry.get_text().strip()
-            if "c3universe.com" not in link:
-                continue
            messages.append(f"Found download link: {link} ({description})")
            dl_links.append(
                {
@@ -252,6 +250,70 @@ def buildDatabase(pages, concurrency):
    return found_songs


+def downloadFile(download_url, download_path, download_filename):
+    attempts = 1
+    p = None
+    try:
+        with requests.get(download_url, stream=True) as r:
+            while attempts <= 3:
+                try:
+                    r.raise_for_status()
+                    break
+                except Exception:
+                    click.echo(
+                        f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/3"
+                    )
+                    sleep(attempts)
+                    attempts += 1
+            if r is None or r.status_code != 200:
+                if r:
+                    code = r.status_code
+                else:
+                    code = "-1"
+                raise HTTPError(download_url, code, "", None, None)
+
+            if not os.path.exists(download_path):
+                os.makedirs(download_path)
+
+            with open(download_filename, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+            click.echo(f"Successfully downloaded to {download_filename}")
+    except Exception as e:
+        click.echo(f"Download attempt failed: {e}")
+        return None
+
+def parseC3Universe(dl_link):
+    try:
+        p = requests.get(dl_link)
+        parsed_html = BeautifulSoup(p.text, "html.parser")
+        download_element = (
+            parsed_html.body.find("div", attrs={"class": "lock-head"})
+            .find("a")
+        )
+        download_url = download_element.get("href")
+        return download_url
+    except Exception as e:
+        click.echo(f"Failed parsing or retrieving file URL from link {dl_link}: {e}")
+        return None
+
+
+def parseMediafire(dl_link):
+    try:
+        p = requests.get(dl_link)
+        parsed_html = BeautifulSoup(p.text, "html.parser")
+        download_element = parsed_html.find(
+            "a", attrs={"aria-label": "Download file"}
+        )
+        if download_element is not None:
+            download_url = download_element.get("href")
+        else:
+            download_url = re.search(r"'(http[s]*://download[0-9]+.mediafire.com/.*)';", p.text).group(1)
+        return download_url
+    except Exception as e:
+        click.echo(f"Failed parsing or retrieving file URL from link {dl_link}: {e}")
+        return None
+
 def downloadSong(destination, filename, entry, dlid, dldesc):
    click.echo(
        f"""> Downloading song "{entry['artist']} - {entry['title']}" by {entry['author']}..."""
@@ -278,19 +340,17 @@ def downloadSong(destination, filename, entry, dlid, dldesc):
            return

    for dl_link in dl_links:
-        try:
-            p = requests.get(dl_link["link"])
-            if p.status_code != 200:
-                raise HTTPError(dl_link["link"], p.status_code, "", None, None)
+        if 'dl.c3universe.com' in dl_link['link']:
+            download_url = parseC3Universe(dl_link["link"])
+        elif 'www.mediafire.com' in dl_link["link"]:
+            download_url = parseMediafire(dl_link["link"])
+        else:
+            click.echo("Download URL is not valid for CLI download; skipping...")
+            click.echo(f"URL: {dl_link['link']}")
+            continue

-            parsed_html = BeautifulSoup(p.text, "html.parser")
-            download_url = (
-                parsed_html.body.find("div", attrs={"class": "lock-head"})
-                .find("a")
-                .get("href")
-            )
-        except Exception as e:
-            click.echo(f"Failed parsing or retrieving HTML link: {e}")
+        if download_url is None:
+            click.echo(f"No valid download URL found, skipping...")
            continue

        download_filename = filename.format(
@@ -312,38 +372,7 @@ def downloadSong(destination, filename, entry, dlid, dldesc):
            click.echo(f"File exists at {download_filename}")
            continue

-        attempts = 1
-        p = None
-        try:
-            with requests.get(download_url, stream=True) as r:
-                while attempts <= 3:
-                    try:
-                        r.raise_for_status()
-                        break
-                    except Exception:
-                        click.echo(
-                            f"Download attempt failed: HTTP {r.status_code}; retrying {attempts}/3"
-                        )
-                        sleep(attempts)
-                        attempts += 1
-                if r is None or r.status_code != 200:
-                    if r:
-                        code = r.status_code
-                    else:
-                        code = "-1"
-                    raise HTTPError(download_url, code, "", None, None)
-
-                if not os.path.exists(download_path):
-                    os.makedirs(download_path)
-
-                with open(download_filename, "wb") as f:
-                    for chunk in r.iter_content(chunk_size=8192):
-                        f.write(chunk)
-                click.echo(f"Successfully downloaded to {download_filename}")
-        except Exception as e:
-            click.echo(f"Download attempt failed: {e}")
-            continue
-
+        downloadFile(download_url, download_path, download_filename)

@click.command(name="build", short_help="Build the local database.")
@click.option(
@@ -594,7 +623,13 @@ def download(_filters, _id, _desc, _limit, _file_structure):
                    for information_filter in song_information_filters:
                        filter_field = information_filter[0].lower()
                        filter_value = information_filter[1].lower()
-                        if re.match("^~", filter_value):
+                        if re.match("^!", filter_value):
+                            filter_value = filter_value.replace("!", "")
+                            if filter_value in song[filter_field].lower():
+                                pending_information_filters.append(False)
+                            else:
+                                pending_information_filters.append(True)
+                        elif re.match("^~", filter_value):
                            filter_value = filter_value.replace("~", "")
                            if filter_value in song[filter_field].lower():
                                pending_information_filters.append(True)
@@ -694,6 +729,13 @@ def search(_filters):
    For example, to match all songs with "Word" in their titles:
      --filter title ~word

+    A filter can be negated by adding an exclamation mark ("!") to the beginning of the
+    "<value>". Note that "!" must be escaped or single-quoted under BASH.
+
+    \b
+    For example, to match all songs except those by Yes as their artist:
+      --filter artist '!Yes'
+
    Instrument filters allow selection of the presence of instruments. If an instrument
    fitler is given, only songs which contain parts for the given instrument(s) will be
    shown.
@@ -750,7 +792,13 @@ def search(_filters):
                    for information_filter in song_information_filters:
                        filter_field = information_filter[0].lower()
                        filter_value = information_filter[1].lower()
-                        if re.match("^~", filter_value):
+                        if re.match("^!", filter_value):
+                            filter_value = filter_value.replace("!", "")
+                            if filter_value in song[filter_field].lower():
+                                pending_information_filters.append(False)
+                            else:
+                                pending_information_filters.append(True)
+                        elif re.match("^~", filter_value):
                            filter_value = filter_value.replace("~", "")
                            if filter_value in song[filter_field].lower():
                                pending_information_filters.append(True)
Author	SHA1	Message	Date
Joshua Boniface	7a2154db69	Improve Mediafire parsing even further	2023-04-29 11:20:37 -04:00
Joshua Boniface	0a0fb144c0	Handle Mediafire link when in Javascript	2023-04-29 11:06:54 -04:00
Joshua Boniface	6ecb62431e	Add negation filters	2023-04-28 20:55:53 -04:00
Joshua Boniface	447eb4120a	Add handler for Mediafile downloads	2023-04-28 18:40:54 -04:00