In [None]:
from json import dumps as json_dumps

import marimo as mo
import plotly.express as px
import polars as pl
from bs4 import BeautifulSoup
from bs4.element import Tag
from httpx import get as http_get
from pydantic import BaseModel
from pydantic_ai.agent import Agent
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider

In [None]:
import plotly.offline
# Initialise plotly's notebook mode up front, before the pie chart further
# down is rendered with `.show()`.
plotly.offline.init_notebook_mode()

In [None]:
# URL of the Shaarli link page that is downloaded and parsed further down.
_SHAARLI_URL = "https://sebsauvage.net/links/"

class Link(BaseModel):
    """One bookmark entry scraped from the link list page.

    Instances are produced by ``extract_link``; every text field is stored
    already stripped of surrounding whitespace.
    """

    # Value of the ``id`` attribute of the entry's first ``<a>`` element.
    id: str
    # Text of the ``.linktitle a`` anchor.
    title: str
    # Target of the title anchor; "" when it only points back at the entry
    # itself (an href of the form "?<id>").
    href: str
    # Text of the ``.linkdescription`` node, or "" when the entry has none.
    description: str
    # Text of each anchor found under ``.linktaglist``.
    tags: list[str]

def extract_link(entry_node: Tag) -> Link:
    """Build a ``Link`` from a single entry element of the link list.

    Args:
        entry_node: The element holding one entry. Its first ``<a>`` carries
            the entry id, and its ``.linkcontainer`` child holds the title,
            the optional description and the tags.

    Returns:
        A ``Link`` whose text fields are whitespace-stripped. ``href`` is ""
        when the title anchor merely points back at the entry ("?<id>"), and
        ``description`` is "" when no ``.linkdescription`` node exists.
    """
    anchor_id = entry_node.select_one("a")["id"]
    container = entry_node.select_one(".linkcontainer")

    title_anchor = container.select_one(".linktitle a")
    target = title_anchor["href"]
    # A target of "?<id>" is a self-reference to this entry, not an external link.
    href = "" if target == "?" + anchor_id else target
    title = title_anchor.text.strip()

    description_node = container.select_one(".linkdescription")
    description = description_node.text.strip() if description_node else ""

    return Link(
        id=anchor_id,
        title=title,
        href=href,
        description=description,
        tags=[
            tag_anchor.text.strip()
            for tag_anchor in container.select(".linktaglist a")
        ],
    )

def extract_links(soup: BeautifulSoup) -> list[Link]:
    """Convert every ``#linklist li`` element of the page into a ``Link``.

    Args:
        soup: The parsed link list page.

    Returns:
        One ``Link`` per matched element, in the order ``select`` returns them.
    """
    entry_nodes = soup.select("#linklist li")
    return list(map(extract_link, entry_nodes))

# Download the link page and raise if the server answered with an error status.
_r = http_get(_SHAARLI_URL)
_r.raise_for_status()

# Parse the HTML with the html5lib parser and turn every entry into a Link.
_soup = BeautifulSoup(_r.text, "html5lib")
_links = extract_links(_soup)
# Tabular form of the scraped links; this frame is what the later cells consume.
links_df = pl.DataFrame(_links)

In [None]:
# Show the scraped links as this cell's output.
links_df

In [None]:
# Base URL handed to the OpenAI provider when the "ollama" backend is selected.
# NOTE(review): "edemaruh" looks like a host on the author's network — adjust
# for your environment.
_OLLAMA_URL = "http://edemaruh:11434/v1"
# Model spec in "<provider>:<model name>" form; it is split on the first ":" below.
_LLM_MODEL = "ollama:qwen3:8b"
# Alternative: _LLM_MODEL = "google-gla:gemini-2.5-flash"
# (any provider prefix other than "ollama" is passed to the Agent unchanged)

# Instructions given to the agent; it is expected to answer with a single
# boolean per item.
_SYSTEM_PROMPT = """\
You are a data classifier. for each content item, determines whether it talks about AI/LLM (True) or not (False). 
"""

# The part before the first ":" selects the backend; the rest is the model name.
_provider_name, _model_name = _LLM_MODEL.split(":", maxsplit=1)
if _provider_name != "ollama":
    # Any other prefix: hand the full "<provider>:<model>" string to the Agent.
    _model = _LLM_MODEL
else:
    # Ollama is reached through the OpenAI provider/model classes pointed at
    # _OLLAMA_URL.
    _provider = OpenAIProvider(base_url=_OLLAMA_URL)
    _model = OpenAIModel(model_name=_model_name, provider=_provider)

# Agent whose run output is a single bool per prompt.
_agent = Agent(
    system_prompt=_SYSTEM_PROMPT,
    output_type=bool,
    model=_model,
)

classified_links_df = links_df.with_columns(
    pl.Series(
        "is_ai",
        [
            (await _agent.run(json_dumps(link))).output
            for link in mo.status.progress_bar(
                links_df.iter_rows(), total=links_df.height
            )
        ],
    )
)

In [None]:
# Show the links together with their `is_ai` classification column.
classified_links_df

In [None]:
# Donut chart of the classification result; slices are keyed by the `is_ai` column.
px.pie(
    data_frame=classified_links_df,
    names="is_ai",
    hole=0.3,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="AI/LLM Links Classification",
).show()