Added old code
parent bb3d40304d
commit f9253e4f50
.gitignore (vendored, +1)
@@ -87,6 +87,7 @@ ipython_config.py
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
 # .python-version
+.env/
 
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
examples/inner_test.py (new file, +21)
@@ -0,0 +1,21 @@
import shadowtube.preprocess as prep
import shadowtube.recommend as rec

raw_history = prep.parse_database("./short.db")

# print(raw_history[0]["title"] + ": " + str(prep.relevancy(raw_history[0], raw_history)))
# print(
#     raw_history[28]["title"] + ": " + str(prep.relevancy(raw_history[28], raw_history))
# )

# for i in range(0, 10):
#     print(prep.get_similarity(raw_history[i], raw_history))

history = prep.sort_history(raw_history)
print(len(history))

# print(recommend(history))

recommendations = rec.recommend(history, verbose=False)

print(recommendations)
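For reference, parse_database (see shadowtube/preprocess.py below) reads newline-delimited JSON, one history entry per line. A hypothetical entry showing only the fields the code actually relies on (videoId, title, description, timeWatched, watchProgress); the values here are made up:

{"videoId": "dQw4w9WgXcQ", "title": "Example video", "description": "...", "timeWatched": 1700000000, "watchProgress": 212}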
examples/tkinter_preview.py (new file, +82)
@@ -0,0 +1,82 @@
import tkinter as tk
import webbrowser
from io import BytesIO

import requests
import yt_dlp
from PIL import Image, ImageTk

import shadowtube.preprocess as prep
import shadowtube.recommend as rec

# Get 8 recommended YouTube video IDs from the FreeTube watch history
video_ids = rec.recommend(
    prep.sort_history(
        prep.parse_database(
            "/home/fedir/.var/app/io.freetubeapp.FreeTube/config/FreeTube/history.db"
        )
    ),
    verbose=True,
    count=8,
)

print(video_ids)


def get_video_info(video_id):
    """Fetch the title and thumbnail URL of a YouTube video using yt_dlp."""
    ydl_opts = {"quiet": True, "no_warnings": True, "extract_flat": True}

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(
            f"https://www.youtube.com/watch?v={video_id}", download=False
        )
        return info.get("title", "Title Not Found"), info.get("thumbnail", "")


def open_video(event, video_id):
    """Open the YouTube video in the default web browser when clicked."""
    webbrowser.open(f"https://www.youtube.com/watch?v={video_id}")


def show_video_preview(video_id, row, col):
    """Fetch and display a video's title and thumbnail in a grid layout."""
    title, thumbnail_url = get_video_info(video_id)

    # Fetch the thumbnail; fall back to no image if it cannot be loaded
    photo = None
    if thumbnail_url:
        response = requests.get(thumbnail_url)
        if response.status_code == 200:
            image_data = Image.open(BytesIO(response.content))
            image_data = image_data.resize((200, 112))  # Resize for grid
            photo = ImageTk.PhotoImage(image_data)

    # Create thumbnail label (clickable)
    thumbnail_label = tk.Label(root, image=photo, cursor="hand2")
    thumbnail_label.image = photo  # Keep a reference so it is not garbage-collected
    thumbnail_label.grid(row=row, column=col, padx=10, pady=10)
    thumbnail_label.bind("<Button-1>", lambda event, v=video_id: open_video(event, v))

    # Create title label (wrapped text) in the row below the thumbnail
    title_label = tk.Label(
        root, text=title, font=("Arial", 10, "bold"), wraplength=200, justify="center"
    )
    title_label.grid(row=row + 1, column=col, padx=10, pady=5)


# Create Tkinter window
root = tk.Tk()
root.title("YouTube Video Previews")

# Add all 8 videos in a 2x4 grid (thumbnail row plus title row per video)
for index, video_id in enumerate(video_ids):
    row = (index // 4) * 2  # Every second row holds the thumbnails
    col = index % 4
    show_video_preview(video_id, row, col)

root.mainloop()
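The history path above is hard-coded to one user's Flatpak install of FreeTube. To try the example elsewhere, the path would need to be adapted; a minimal sketch, where both candidate locations are assumptions about typical FreeTube installs and the helper name is hypothetical:

import os

# Hypothetical helper: pick whichever FreeTube history database exists.
# Both locations are assumptions about common installs, not part of this commit.
def find_freetube_history():
    candidates = [
        # Flatpak install
        os.path.expanduser(
            "~/.var/app/io.freetubeapp.FreeTube/config/FreeTube/history.db"
        ),
        # Native install
        os.path.expanduser("~/.config/FreeTube/history.db"),
    ]
    for path in candidates:
        if os.path.exists(path):
            return path
    raise FileNotFoundError("no FreeTube history.db found")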
shadowtube/__init__.py (new file, +2)
@@ -0,0 +1,2 @@
from .preprocess import *
from .recommend import *
shadowtube/cache.db (new file, binary: not shown)
shadowtube/cache_similarity.db (new file, binary: not shown)
shadowtube/preprocess.py (new file, +211)
@@ -0,0 +1,211 @@
import json
import math
import re
import sqlite3
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# The embedding model is loaded lazily on first use (see get_embedding)
model = None

cache = sqlite3.connect("cache_similarity.db")
cursor = cache.cursor()


def parse_database(filename):
    """Parse a newline-delimited JSON history file into a list of dicts."""
    parsed_data = []

    with open(filename, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:
                continue

            try:
                parsed_data.append(json.loads(line))
            except json.JSONDecodeError:
                # Handle unquoted values by fixing the line
                fixed_line = fix_unquoted_values(line)
                parsed_data.append(json.loads(fixed_line))

    return parsed_data


def get_embedding(text: str):
    global model
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")
    return model.encode(text, normalize_embeddings=True)


def compute_similarity(entry_embedding, database_embeddings):
    """Cosine similarities between one embedding and a batch of embeddings."""
    similarities = cosine_similarity([entry_embedding], database_embeddings)[0]
    return similarities


def get_cached_similarity_reflexive(entry1, entry2, verbose=False):
    """Look up a cached similarity under both key orders, since pairs are unordered."""
    response = []
    response.extend(get_cached_similarity(entry1, entry2, verbose=verbose))
    response.extend(get_cached_similarity(entry2, entry1, verbose=verbose))
    if len(response) > 1:
        print("WARN: duplicate pairs!")
    return response


def get_cached_similarity(entry1, entry2, verbose=False):
    pair_id = entry1["videoId"] + "/" + entry2["videoId"]
    if verbose:
        print("INFO: querying cache for " + pair_id + " (get_cached_similarity)")
    # Parameterized query instead of string concatenation
    cursor.execute("SELECT factor FROM similarity WHERE id=?", (pair_id,))
    response = cursor.fetchall()
    if verbose:
        print("INFO: response was: " + str(response) + " (get_cached_similarity)")
    return [float(tup[0]) for tup in response]


def set_cached_similarity(ids, similarities: List[float], verbose=False):
    pair_insert = [(ids[i], float(similarities[i])) for i in range(len(ids))]
    cursor.executemany("INSERT INTO similarity VALUES(?, ?)", pair_insert)
    cache.commit()


def get_similarity(entry1, entry2, use_cache=True):
    """Similarity between two history entries, read from the cache when possible."""
    similarity = []
    if use_cache:
        similarity = get_cached_similarity_reflexive(entry1, entry2)
        if len(similarity) == 0:
            entry1_embedding = get_embedding(
                entry1["title"] + " " + entry1["description"]
            )
            entry2_embedding = get_embedding(
                entry2["title"] + " " + entry2["description"]
            )
            similarity.append(compute_similarity(entry1_embedding, [entry2_embedding]))
            set_cached_similarity(
                [entry1["videoId"] + "/" + entry2["videoId"]], similarity[0]
            )
    else:
        entry1_embedding = get_embedding(entry1["title"] + " " + entry1["description"])
        entry2_embedding = get_embedding(entry2["title"] + " " + entry2["description"])
        similarity.append(compute_similarity(entry1_embedding, [entry2_embedding]))

    return similarity[0]


def get_global_similarity(entry, database, k=10, use_cache=True, verbose=False):
    """Mean similarity between `entry` and its k most similar entries in `database`."""
    entry_text = entry["title"] + " " + entry["description"]

    similarities: List[float] = []

    # Collect the texts of all entries whose similarity is not cached yet
    database_texts = []
    text_keys = []
    for e in database:
        if entry["videoId"] != e["videoId"]:
            cached = get_cached_similarity_reflexive(entry, e, verbose=verbose)
            if len(cached) == 0:
                text_keys.append(entry["videoId"] + "/" + e["videoId"])
                if "description" not in e:
                    print(e["title"])
                    database_texts.append(e["title"])
                else:
                    database_texts.append(e["title"] + " " + e["description"])
            else:
                similarities.append(cached[0])

    # Compute and cache the similarities that were missing
    if len(text_keys) > 0:
        entry_embedding = get_embedding(entry_text)
        database_embeddings = np.array([get_embedding(text) for text in database_texts])
        computed_similarities: List[float] = compute_similarity(
            entry_embedding, database_embeddings
        )
        set_cached_similarity(text_keys, computed_similarities)
        similarities.extend(computed_similarities)

    # Average the k highest similarities; self-similarity was already
    # excluded by the videoId check above
    similarities_sorted = np.sort(similarities)[-k:]

    return float(np.mean(similarities_sorted))


def sort_history(history):
    """Returns the same history, sorted in place by a quality score derived
    from watch time and watch progress."""
    sorted_history = history

    # Min-max bounds for normalizing timeWatched and watchProgress
    max_time_watched = -math.inf
    min_time_watched = math.inf
    max_watch_progress = -math.inf
    min_watch_progress = math.inf
    for entry in history:
        if "timeWatched" in entry:
            if int(entry["timeWatched"]) > max_time_watched:
                max_time_watched = int(entry["timeWatched"])
            if int(entry["timeWatched"]) < min_time_watched:
                min_time_watched = int(entry["timeWatched"])
        if "watchProgress" in entry:
            if int(entry["watchProgress"]) > max_watch_progress:
                max_watch_progress = int(entry["watchProgress"])
            if int(entry["watchProgress"]) < min_watch_progress:
                min_watch_progress = int(entry["watchProgress"])

    wp_factor = max_watch_progress - min_watch_progress
    if wp_factor == 0:
        wp_factor = 1  # avoid division by zero when all values are equal
    wp_offset = min_watch_progress

    tw_factor = max_time_watched - min_time_watched
    if tw_factor == 0:
        tw_factor = 1
    tw_offset = min_time_watched

    def quality(entry):
        q = 0
        if "timeWatched" in entry:
            q += (entry["timeWatched"] - tw_offset) / tw_factor
        else:
            q += 0.5

        if "watchProgress" in entry:
            q += (entry["watchProgress"] - wp_offset) / wp_factor
        else:
            q += 0.5

        # EXPERIMENTAL!!! WILL MAKE COMPUTER EXPLODE!!!
        # q += get_similarity(entry, history)

        # Map q in [0, 2] to a score in [0, 1]; higher q yields a lower score
        return (2 - q) / 2

    for entry in sorted_history:
        entry["quality"] = quality(entry)

    sorted_history.sort(key=lambda x: x["quality"])

    return sorted_history


def fix_unquoted_values(line):
    """Attempts to fix unquoted JSON values by adding quotes around them."""

    def replacer(match):
        key, value = match.groups()
        if not (value.startswith('"') and value.endswith('"')):
            value = f'"{value}"'  # Add quotes around the value
        return f'"{key}":{value}'

    fixed_line = re.sub(r'"(\w+)":(\w+)', replacer, line)
    return fixed_line
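Note that preprocess.py reads and writes a `similarity` table in cache_similarity.db but never creates it; the committed binary cache presumably already contains it. A minimal one-time setup sketch, with the table and column names taken from the queries above and the column types assumed:

import sqlite3

# Assumed schema: the table and column names come from the SELECT/INSERT
# statements in preprocess.py; the types are a guess.
cache = sqlite3.connect("cache_similarity.db")
cache.execute("CREATE TABLE IF NOT EXISTS similarity (id TEXT, factor REAL)")
cache.commit()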
shadowtube/recommend.py (new file, +155)
@@ -0,0 +1,155 @@
import sqlite3
from random import sample

from innertube.clients import InnerTube

client = InnerTube("WEB")
cache = sqlite3.connect("cache.db")
cursor = cache.cursor()


# def recommend(history):
#     recommendations = []
#     for video_id in history:
#         video = client.next(video_id)
#         recommendations += fetch_recommended(video)
#     return recommendations


# data = client.search(query="distrotube")
# video = client.next("BV1O7RR-VoA")


def fetch_recommended(video, verbose=False):
    """Extract all recommended video IDs from an InnerTube `next` response."""
    recommended = []

    # The autoplay video, if present
    try:
        recommended.append(
            video["contents"]["twoColumnWatchNextResults"]["autoplay"]["autoplay"][
                "sets"
            ][0]["autoplayVideo"]["watchEndpoint"]["videoId"]
        )
    except (KeyError, IndexError):
        if verbose:
            print("ERR: no autoplay (fetch_recommended)")

    # The sidebar recommendations
    try:
        reccount = len(
            video["contents"]["twoColumnWatchNextResults"]["secondaryResults"][
                "secondaryResults"
            ]["results"]
        )

        if verbose:
            print(
                "INFO: recommendation count is "
                + str(reccount)
                + " (fetch_recommended)"
            )
    except KeyError:
        if verbose:
            print("ERR: no recommendations (fetch_recommended)")
    else:
        for rec in video["contents"]["twoColumnWatchNextResults"]["secondaryResults"][
            "secondaryResults"
        ]["results"]:
            if "compactVideoRenderer" in rec:
                recommended.append(rec["compactVideoRenderer"]["videoId"])
            elif verbose:
                print("WARN: invalid format (fetch_recommended)")

    # An empty string marks "no recommendations found"
    if len(recommended) == 0:
        recommended.append("")

    return recommended


def fetch_from_database(video_id, verbose=False):
    if verbose:
        print("INFO: querying cache for " + video_id + " (fetch_from_database)")
    # Parameterized query instead of string concatenation
    cursor.execute("SELECT vid2 FROM recommends WHERE vid1=?", (video_id,))
    response = cursor.fetchall()
    if verbose:
        print("INFO: response was: " + str(response))
    return [tup[0] for tup in response]


def insert_to_database(video_id, recommended):
    pair_insert = [(video_id, vid) for vid in recommended]
    cursor.executemany("INSERT INTO recommends VALUES(?, ?)", pair_insert)
    cache.commit()


def recommend_by_video(video_id, use_cache=True, verbose=False):
    recommended = []
    if use_cache:
        response = fetch_from_database(video_id, verbose=verbose)
        if len(response) == 0:
            recommended_now = fetch_recommended(client.next(video_id), verbose=verbose)
            # Only cache non-empty results
            if len(recommended_now) != 0 and recommended_now[0] != "":
                insert_to_database(video_id, recommended_now)
            recommended = recommended_now
        else:
            recommended = response
    else:
        recommended = fetch_recommended(client.next(video_id), verbose=verbose)
    return recommended


def sampler(history, count, minp: float = 0):
    """Selects `count` random videos from history.

    `minp` cuts off all videos with a quality below `minp`; set to 0 to disable.
    """
    videos = []
    for _ in range(count):
        entry = sample(history, 1)[0]
        if "quality" in entry:
            # Resample until the entry clears the quality cutoff
            # (assumes at least one such entry exists)
            while entry["quality"] < minp:
                entry = sample(history, 1)[0]
        videos.append(entry)

    return videos


def remove_viewed(history, videos):
    """Drop recommendations that already appear in the watch history."""
    watched = {entry["videoId"] for entry in history}
    return [video for video in videos if video not in watched]


def recommend(history, count=8, verbose=False, use_cache=True):
    recommended = []
    initial_videos = sampler(history, count, minp=0.8)
    for entry in initial_videos:
        video_id = entry["videoId"]
        title = entry["title"]
        if verbose:
            print("INFO: recommending by " + video_id + ': "' + title + '"')
        rec = remove_viewed(
            history, recommend_by_video(video_id, use_cache=use_cache, verbose=verbose)
        )
        if rec and rec[0] != "":
            recommended.append(sample(rec, k=1)[0])

    return recommended


def id_to_link(id_list):
    return ["https://youtu.be/" + video_id for video_id in id_list]
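Likewise, recommend.py assumes a `recommends` table already exists in cache.db (shipped as a binary file in this commit). A minimal sketch of the implied schema, with column types assumed:

import sqlite3

# Assumed schema, inferred from "SELECT vid2 FROM recommends WHERE vid1=?"
# and the two-column INSERT in recommend.py; the types are a guess.
cache = sqlite3.connect("cache.db")
cache.execute("CREATE TABLE IF NOT EXISTS recommends (vid1 TEXT, vid2 TEXT)")
cache.commit()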