[GH-ISSUE #816] [SoFIFA] Add Fetch Icon #173

Open
opened 2026-03-02 15:56:24 +03:00 by kerem · 0 comments
Owner

Originally created by @rekvizitt on GitHub (Feb 22, 2025).
Original GitHub issue: https://github.com/probberechts/soccerdata/issues/816

Is your feature request related to a problem? Please describe.
Currently, the soccerdata library does not support fetching and saving team icons when retrieving team ratings from SoFIFA. This limits the ability to visually represent teams in applications or analyses that use this data.

Describe the solution you'd like
I propose adding functionality to fetch and save team icons when retrieving team ratings from SoFIFA. This would involve:

  1. Modifying the read_team_ratings method to extract the image URL from the HTML response.
  2. Adding a new method get_image to handle downloading and saving the image.
  3. Storing the path to the saved image in the returned DataFrame for easy access.

Describe alternatives you've considered

  1. Manual Download: Users could manually download images, but this is time-consuming and error-prone.
  2. External Libraries: Users could scrape the images with external libraries, but integrating this directly into soccerdata provides a more seamless user experience.
  3. API Integration: If SoFIFA provides an API for images, integrating it could be more efficient, but this requires API access and documentation.

Additional context
Here is the modified code snippet that implements the proposed feature:

import os
import requests

def read_team_ratings(self) -> pd.DataFrame:
    """Retrieve ratings and team icons for all teams in the selected leagues.

    For every league/version combination the league overview page is read
    through ``self.get`` and one row per team is extracted. When a row
    contains an icon, the icon is stored as a PNG file in ``self.data_dir``
    and its location is recorded in the ``image_path`` column.

    Returns
    -------
    pd.DataFrame
        One row per team, indexed by ``league`` and ``team``. Rating values
        are the stripped cell texts (``None`` when a cell is missing).
        ``image_path`` is the path of the icon file as a string, or ``None``
        when the row has no icon or the icon could not be stored.
    """
    # Define id and description of ratings to retrieve
    ratings = {
        "oa": "overall",
        "at": "attack",
        "md": "midfield",
        "df": "defence",
        "tb": "transfer_budget",
        "cw": "club_worth",
        "bs": "build_up_speed",
        "bd": "build_up_dribbling",
        "bp": "build_up_passing",
        "bps": "build_up_positioning",
        "cc": "chance_creation_crossing",
        "cp": "chance_creation_passing",
        "cs": "chance_creation_shooting",
        "cps": "chance_creation_positioning",
        "da": "defence_aggression",
        "dm": "defence_pressure",
        "dw": "defence_team_width",
        "dd": "defence_defender_line",
        "dp": "defence_domestic_prestige",
        "ip": "international_prestige",
        "ps": "players",
        "sa": "starting_xi_average_age",
        "ta": "whole_team_average_age",
    }

    # Build URL: the two "{}" placeholders are league id and version id,
    # followed by one showCol[] query parameter per requested rating.
    urlmask = SO_FIFA_API + "/teams?lg={}&r={}&set=true"
    urlmask += "".join(f"&showCol[]={rating_id}" for rating_id in ratings)
    filemask = "teams_{}_{}.html"

    # Get league IDs
    leagues = self.read_leagues()

    # Collect teams
    teams = []
    iterator = list(product(leagues.iterrows(), self.versions.iterrows()))
    for i, ((lkey, league), (version_id, version)) in enumerate(iterator):
        logger.info(
            "[%s/%s] Retrieving teams for %s in %s edition",
            i + 1,
            len(iterator),
            lkey,
            version["update"],
        )
        league_id = league["league_id"]

        # Read HTML page (league overview)
        filepath = self.data_dir / filemask.format(league_id, version_id)
        url = urlmask.format(league_id, version_id)
        reader = self.get(url, filepath)

        # Extract one record per table row
        tree = html.parse(reader)
        for node in tree.xpath("//table/tbody/tr"):
            # Extract team name; rows without a usable name are skipped.
            # ``.text`` is None when the anchor has no direct text node.
            team_name_node = node.xpath(".//td[2]//a")
            if not team_name_node:
                continue
            team_name = (team_name_node[0].text or "").strip()
            if not team_name:
                continue

            # Extract ratings safely
            team_data = {"league": lkey, "team": team_name, **version.to_dict()}
            for key, desc in ratings.items():
                value_nodes = node.xpath(f".//td[@data-col='{key}']//text()")
                team_data[desc] = value_nodes[0].strip() if value_nodes else None

            # Always provide the key so every row has the same columns.
            team_data["image_path"] = None

            # Extract image URL (lazy-loaded images keep it in data-src)
            img_node = node.xpath(".//td[1]//img")
            img_url = None
            if img_node:
                img_url = img_node[0].get("data-src") or img_node[0].get("src")
            if img_url:
                # Team names may contain characters such as "/" that are not
                # valid in a file name or would escape data_dir.
                safe_name = "".join(
                    ch if ch.isalnum() or ch in " ._-" else "_" for ch in team_name
                )
                img_path = self.data_dir / f"{safe_name}.png"
                # Reuse a previously stored icon instead of downloading again.
                if not img_path.exists():
                    self.get_image(img_url, str(img_path))
                # Only advertise the path when the file is really there.
                if img_path.exists():
                    team_data["image_path"] = str(img_path)

            teams.append(team_data)

    # Return DataFrame
    return (
        pd.DataFrame(teams)
        .replace({"team": TEAMNAME_REPLACEMENTS})
        .set_index(["league", "team"])
        .sort_index()
    )

def get_image(self, url, save_path, timeout=30):
    """Download an image and write it to disk.

    Parameters
    ----------
    url : str
        Location of the image.
    save_path : str or path-like
        File the response body is written to (opened in binary mode).
    timeout : float, default 30
        Seconds to wait for the server before ``requests`` gives up.
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    bool
        ``True`` when the server answered with status 200 and the file was
        written, ``False`` otherwise (nothing is written in that case).

    Raises
    ------
    requests.RequestException
        Propagated unchanged from ``requests.get`` (connection errors,
        timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Keep the best-effort behaviour, but leave a trace for the user.
        logger.warning(
            "Could not download image %s (HTTP status %s)",
            url,
            response.status_code,
        )
        return False
    with open(save_path, "wb") as f:
        f.write(response.content)
    return True

My test code:

import soccerdata as sd
import pandas as pd
import numpy as np
from pathlib import Path
import logging
import os

logging.basicConfig(level=logging.INFO)

leagues = ["ENG-Premier League"]

# All artefacts of this test run live below ./test_data
current_dir = Path.cwd()
data_dir = current_dir / 'test_data'
raw_data_dir = data_dir / 'raw'
sofifa_data_dir = data_dir / 'soFIFA'

raw_data_dir.mkdir(parents=True, exist_ok=True)
sofifa_data_dir.mkdir(parents=True, exist_ok=True)

sofifa = sd.SoFIFA(leagues=leagues, versions='latest', data_dir=sofifa_data_dir)

logging.info("Fetching team ratings from SoFIFA...")
try:
    sofifa_ratings = sofifa.read_team_ratings()
except Exception as e:
    # Top-level boundary of the script: log the traceback and stop here.
    logging.exception("Error fetching SoFIFA data: %s", e)
else:
    logging.info("SoFIFA team ratings fetched successfully.")
    # Export only when the fetch succeeded; otherwise sofifa_ratings is
    # undefined. orient='records' omits the index, so move league/team
    # back into regular columns first.
    sofifa_ratings.reset_index().to_json(
        raw_data_dir / 'sofifa_ratings.json', orient='records', indent=4
    )

Result:

Image

Originally created by @rekvizitt on GitHub (Feb 22, 2025). Original GitHub issue: https://github.com/probberechts/soccerdata/issues/816 **Is your feature request related to a problem? Please describe.** Currently, the soccerdata library does not support fetching and saving team icons when retrieving team ratings from SoFIFA. This limits the ability to visually represent teams in applications or analyses that use this data. **Describe the solution you'd like** I propose adding functionality to fetch and save team icons when retrieving team ratings from SoFIFA. This would involve: 1. Modifying the [read_team_ratings](https://github.com/probberechts/soccerdata/blob/aed3caa939a25af96e8ab71356e6b9879020dd9a/soccerdata/sofifa.py#L295) method to extract the image URL from the HTML response. 2. Adding a new method get_image to handle downloading and saving the image. 3. Storing the path to the saved image in the returned DataFrame for easy access. **Describe alternatives you've considered** 1. Manual Download: Users could manually download images, but this is time-consuming and error-prone. 2. External Libraries: Using external libraries to scrape images, but integrating this directly into soccerdata provides a more seamless user experience. 3. API Integration: If SoFIFA provides an API for images, integrating it could be more efficient, but this requires API access and documentation. **Additional context** Here is the modified code snippet that implements the proposed feature: ```python import os import requests def read_team_ratings(self) -> pd.DataFrame: """Retrieve ratings for all teams in the selected leagues. 
Returns ------- pd.DataFrame """ # Define id and description of ratings to retrieve ratings = { "oa": "overall", "at": "attack", "md": "midfield", "df": "defence", "tb": "transfer_budget", "cw": "club_worth", "bs": "build_up_speed", "bd": "build_up_dribbling", "bp": "build_up_passing", "bps": "build_up_positioning", "cc": "chance_creation_crossing", "cp": "chance_creation_passing", "cs": "chance_creation_shooting", "cps": "chance_creation_positioning", "da": "defence_aggression", "dm": "defence_pressure", "dw": "defence_team_width", "dd": "defence_defender_line", "dp": "defence_domestic_prestige", "ip": "international_prestige", "ps": "players", "sa": "starting_xi_average_age", "ta": "whole_team_average_age", } # Build URL urlmask = SO_FIFA_API + "/teams?lg={}&r={}&set=true" for rating_id in ratings: urlmask += f"&showCol[]={rating_id}" filemask = "teams_{}_{}.html" # Get league IDs leagues = self.read_leagues() # Collect teams teams = [] iterator = list(product(leagues.iterrows(), self.versions.iterrows())) for i, ((lkey, league), (version_id, version)) in enumerate(iterator): logger.info( "[%s/%s] Retrieving teams for %s in %s edition", i + 1, len(iterator), lkey, version["update"], ) league_id = league["league_id"] # Read HTML page (league overview) filepath = self.data_dir / filemask.format(league_id, version_id) url = urlmask.format(league_id, version_id) reader = self.get(url, filepath) # Extract team links tree = html.parse(reader) for node in tree.xpath("//table/tbody/tr"): # Extract team name team_name_node = node.xpath(".//td[2]//a") if not team_name_node: continue # Skip rows without a valid team name team_name = team_name_node[0].text.strip() if team_name_node else None # Extract ratings safely team_data = {"league": lkey, "team": team_name, **version.to_dict()} for key, desc in ratings.items(): value_nodes = node.xpath(f".//td[@data-col='{key}']//text()") team_data[desc] = value_nodes[0].strip() if value_nodes else None # Extract image URL img_node = 
node.xpath(".//td[1]//img") if img_node: img_url = img_node[0].get("data-src") or img_node[0].get("src") if img_url: img_filename = os.path.join(self.data_dir, f"{team_name}.png") self.get_image(img_url, img_filename) team_data["image_path"] = img_filename teams.append(team_data) # Return DataFrame return ( pd.DataFrame(teams) .replace({"team": TEAMNAME_REPLACEMENTS}) .set_index(["league", "team"]) .sort_index() ) def get_image(self, url, save_path): response = requests.get(url) if response.status_code == 200: with open(save_path, 'wb') as f: f.write(response.content) ``` my test code: ```python import soccerdata as sd import pandas as pd import numpy as np from pathlib import Path import logging import os logging.basicConfig(level=logging.INFO) leagues = ["ENG-Premier League"] seasons = ["2024-25"] current_dir = Path.cwd() data_dir = current_dir / 'test_data' raw_data_dir = data_dir / 'raw' sofifa_data_dir = data_dir / 'soFIFA' raw_data_dir.mkdir(parents=True, exist_ok=True) sofifa_data_dir.mkdir(parents=True, exist_ok=True) sofifa = sd.SoFIFA(leagues=leagues, versions='latest', data_dir=sofifa_data_dir) logging.info("Fetching team ratings from SoFIFA...") try: sofifa_ratings = sofifa.read_team_ratings() logging.info("SoFIFA team ratings fetched successfully.") except Exception as e: logging.error(f"Error fetching SoFIFA data: {e}") sofifa_ratings.to_json(raw_data_dir / 'sofifa_ratings.json', orient='records', indent=4) ``` Result: ![Image](https://github.com/user-attachments/assets/c4d680f7-a2f8-42f5-bafd-b04f0a73192d)
Sign in to join this conversation.
No milestone
No project
No assignees
1 participant
Notifications
Due date
The due date is invalid or out of range. Please use the format "yyyy-mm-dd".

No due date set.

Dependencies

No dependencies set.

Reference
starred/soccerdata#173
No description provided.