<a href="https://colab.research.google.com/github/bdekoz/midnight.sfo-crux/blob/main/lcp_perf_analysis_2025q2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Performance analysis concentrating on understanding the current state of the Largest Contentful Paint (LCP) metric, and how it correlates and compares to existing NavigationTiming and Visual metrics.

Experimental data was collected between 2025-02-07 and 2025-02-11 on a wireless connection in SFO/149 NMG room 546. The Chrome (version not recorded) and Firefox (nightly as of 2025-02-04) browsers were hosted on Android 15. The browsertime testing framework was used in combination with [sitelists generated](https://github.com/bdekoz/mozilla-data-sitelists/blob/main/docs/sitelist_generation.md) from CrUX.

Two devices were used: a rooted Pixel 8 (shiba) and a rooted Google Tablet (tangor). See the device setup [details](https://sunglint.wordpress.com/2024/05/22/android-2024-devices/).

[github repo 1](https://github.com/bdekoz/midnight.sfo-2025-02.1)

[github repo 2](https://github.com/bdekoz/midnight.sfo-2025-02.2)



# shared

In [None]:
#@title setup, static and constant data
import json
import os
import requests

from google.cloud import bigquery
from datetime import datetime, timedelta
from re import sub
from os import path
from google.colab import auth
auth.authenticate_user()

# Mount Google Drive
from google.colab import drive
gdrive = '/content/drive/'
gdriveprefix = gdrive + 'My Drive/Colab Notebooks/'
drive.mount(gdrive, force_remount=True)

# use interactive tables
from google.colab import data_table
data_table.enable_dataframe_formatter()


# NOTE(review): this rebinds the name `datetime` to the module, replacing the
# `datetime` class bound earlier in this cell by
# `from datetime import datetime, timedelta`. After this line, code must use
# `datetime.datetime` to reach the class -- confirm which binding is intended.
import datetime
import time
import hashlib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
# Two-sample statistical tests.
from scipy.stats import ttest_ind
from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu

pd.set_option('display.width', 1920)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Whitespace constants used when printing.
tab = "\t"
newline = "\n"


# Result files live in GitHub repositories whose names encode:
#
#   origin:    ci  == mozilla CI results
#              sfo == mozilla SFO results
#
#   ISO year and month, e.g. 2025-02
#
#   sitelist identifier:
#     x.1      baseline tp6m
#     x.2      baseline tp6m with a11y on
#     x.3      CrUX 2024 phone 10k sites, with specific web content
#     x.3x40   4 slices x 10 sites
#     x.3x100  4 slices x 25 sites

data1_file = 'https://raw.githubusercontent.com/bdekoz/midnight.sfo-2025-02.1/main/results/2025-02-11/data.json'
data2_file = 'https://raw.githubusercontent.com/bdekoz/midnight.sfo-2025-02.2/main/results/2025-02-13/data.json'

data100_file = 'https://raw.githubusercontent.com/bdekoz/midnight.sfo-2025-02.3x100/refs/heads/main/results/2025-02-15/data.json'
data100a_file = 'https://raw.githubusercontent.com/bdekoz/midnight.sfo-2025-02.3x100/refs/heads/main/results/2025-02-20/data.json'


Mounted at /content/drive/


In [None]:
#@title percentage(numerator, denom) / percentage_change(one, two) / deserialize_csv(csvfile)

def percentage(part, whole):
    """Return `part` as a share of `whole`, formatted as a whole-number percent string.

    Both arguments are converted with float(); the ratio is rounded to zero
    decimal places and rendered like '53%'.
    """
    ratio = 100 * float(part) / float(whole)
    whole_percent = int(round(ratio, 0))
    return f"{whole_percent}%"

def percentage_change(col1, col2):
    """Return the percent change from `col1` to `col2`: (col2 - col1) / col1 * 100.

    Only arithmetic operators are applied to the arguments, so scalars and
    pandas Series are both handled.
    """
    delta = col2 - col1
    relative = delta / col1
    return relative * 100


def deserialize_csv(csvfile):
    """Read a '|'-separated CSV into a DataFrame indexed by its first column.

    Missing values are replaced with 0.0 (rather than left as NaN or read as
    empty strings).

    `csvfile` is passed straight to pandas.read_csv.

    Returns the DataFrame with the first column moved into the index.
    """
    df = pd.read_csv(csvfile, sep='|').fillna(value=0.0)
    # set_index returns a new frame; the original code discarded that result,
    # so the first column was never actually made the index.
    return df.set_index(df.columns[0])



In [None]:
#@title deserialize_json(jfile) / deserialize_json_metrics_by_browser / content_traits_active_count / content_traits_active_p / deserialize_json_metrics_by_browser_if

# simple, all pandas
def deserialize_json(jfile):
    """Read a JSON document into a DataFrame indexed by its 'test' field.

    `jfile` is passed straight to pandas.read_json.

    Returns the DataFrame with the 'test' column moved into the index.
    """
    df = pd.read_json(jfile)
    # set_index returns a new frame; the original code discarded that result,
    # so 'test' was never actually made the index.
    return df.set_index('test')


# Assume jfile is a json file that is an aggregate of pageload data
# collected from a set of URLS, matching existing midnight schema.
def deserialize_json_metrics_by_browser(jfile, browser):
    """Download aggregated pageload JSON and return per-site median metrics for one browser.

    `jfile` is a URL fetched with requests.get. The decoded JSON is iterated as
    a sequence of items; each item must provide item['test'] (used as the row
    label) and item[browser]['metrics'], which must contain 'lcp_element'.

    For every item, each metrics entry that is a dict with a 'median' key
    contributes that median as a column; 'lcp_element' is copied as-is.

    Returns a DataFrame with one row per item, with the columns
    'largestContentfulPaintloadTime' / 'largestContentfulPaintrenderTime'
    renamed to 'lcp_load' / 'lcp_render'. The row count is printed.

    Raises requests.HTTPError when the server answers with an error status,
    and KeyError when an item lacks one of the required keys.
    """
    response = requests.get(jfile)
    # Surface HTTP failures (e.g. a wrong results path) directly instead of as
    # an opaque JSON decoding error on the error page body.
    response.raise_for_status()
    data = response.json()

    metric_data = {}
    for item in data:
        url_mini = item['test']
        metrics = item[browser]['metrics']

        # Keep only the metrics that carry a 'median' field.
        row = {
            key: value["median"]
            for key, value in metrics.items()
            if isinstance(value, dict) and "median" in value
        }
        row['lcp_element'] = metrics['lcp_element']

        metric_data[url_mini] = row

    # The nested dict is keyed by site, so it is built column-wise and then
    # transposed to get one row per site.
    df = pd.DataFrame(metric_data).transpose()
    df = df.rename(columns={"largestContentfulPaintloadTime": "lcp_load", "largestContentfulPaintrenderTime": "lcp_render"})
    print("dataframe rows: " + str(len(df.index)))
    return df


# Number of fields of the url_content_traits object that have a non-zero value.
def content_traits_active_count(data):
    """Return how many values in the mapping `data` are not equal to 0."""
    return sum(1 for value in data.values() if value != 0)

# Whether the given field (trait) of the url_content_traits object has a non-zero value.
def content_traits_active_p(data, trait):
    """Return whether `data[trait]` is not equal to 0.

    Raises KeyError if `trait` is not a key of `data`.
    """
    return data[trait] != 0


# Assume jfile is a json file that is an aggregate of pageload data
# collected from a set of URLS, matching existing midnight schema.
def deserialize_json_metrics_by_browser_if(jfile, browser, trait):
    """Download aggregated pageload JSON and return per-site median metrics for
    one browser, restricted to sites where a content trait is active.

    `jfile` is a URL fetched with requests.get. The decoded JSON is iterated as
    a sequence of items; each item must provide item['test'] (used as the row
    label), item['url_content_traits'] and, for items that pass the filter,
    item[browser]['metrics'] containing 'lcp_element'.

    Items for which content_traits_active_p(item['url_content_traits'], trait)
    is false are skipped. For the remaining items, each metrics entry that is a
    dict with a 'median' key contributes that median as a column;
    'lcp_element' is copied as-is.

    Returns a DataFrame with one row per kept item, with the columns
    'largestContentfulPaintloadTime' / 'largestContentfulPaintrenderTime'
    renamed to 'lcp_load' / 'lcp_render'. The row count is printed.

    Raises requests.HTTPError when the server answers with an error status,
    and KeyError when an item lacks one of the required keys.
    """
    response = requests.get(jfile)
    # Surface HTTP failures (e.g. a wrong results path) directly instead of as
    # an opaque JSON decoding error on the error page body.
    response.raise_for_status()
    data = response.json()

    metric_data = {}
    for item in data:
        url_mini = item['test']
        traits_dict = item['url_content_traits']

        # Skip sites where the requested trait is not active.
        if not content_traits_active_p(traits_dict, trait):
            continue

        metrics = item[browser]['metrics']

        # Keep only the metrics that carry a 'median' field.
        row = {
            key: value["median"]
            for key, value in metrics.items()
            if isinstance(value, dict) and "median" in value
        }
        row['lcp_element'] = metrics['lcp_element']

        metric_data[url_mini] = row

    # The nested dict is keyed by site, so it is built column-wise and then
    # transposed to get one row per site.
    df = pd.DataFrame(metric_data).transpose()
    df = df.rename(columns={"largestContentfulPaintloadTime": "lcp_load", "largestContentfulPaintrenderTime": "lcp_render"})
    print("dataframe rows: " + str(len(df.index)))
    return df


In [None]:
#@title compare_2_lcp_elements / compare_4_lcp_elements / compare_2_lcp_values

# Compare two data frames for LCP element matching
def compare_2_lcp_elements(df1, df2):
    """Return the share of rows whose 'lcp_element' is equal in both frames.

    The result is a percent string produced by percentage(), e.g. '53%',
    computed as (number of equal rows) / (number of rows in df1).

    Raises ValueError if the two frames have different numbers of rows. The
    original code only printed "Exiting" and carried on into the comparison.
    Note that the Series comparison below is performed by pandas, which also
    expects both 'lcp_element' Series to be identically labeled.
    """
    dflen = len(df1.index)
    if dflen != len(df2.index):
        raise ValueError(
            "dataframes have different lengths: "
            + str(dflen) + " vs " + str(len(df2.index))
        )

    # Summing the boolean result counts the rows where the elements are equal.
    lcpe_same = (df2["lcp_element"] == df1["lcp_element"]).sum()
    return percentage(lcpe_same, dflen)


# Compare four data frames for LCP element matching
def compare_4_lcp_elements(df1, df2, df3, df4):
    """Return the share of rows where 'lcp_element' is equal in df1/df2 AND in df3/df4.

    The result is a percent string produced by percentage(), computed as
    (rows where both pairs are equal) / (number of rows in df1).

    Raises ValueError unless all four frames have the same number of rows.
    The original check used the chained comparison `a != b != c != d`, which
    is true only when every adjacent pair differs, so most mismatches (for
    example df1 and df2 equal but df3 shorter) were never reported; it also
    only printed a message instead of stopping.
    """
    lengths = [len(df.index) for df in (df1, df2, df3, df4)]
    if len(set(lengths)) != 1:
        raise ValueError(
            "dataframes have different lengths: " + ", ".join(map(str, lengths))
        )
    dflen = lengths[0]

    same_first_pair = df1["lcp_element"] == df2["lcp_element"]
    same_second_pair = df3["lcp_element"] == df4["lcp_element"]
    # A row counts only when the element is equal within both pairs.
    lcpe_same = (same_first_pair & same_second_pair).sum()

    return percentage(lcpe_same, dflen)


# Compare two data frames (with LCP element matching) for LCPrender times
def compare_2_lcp_values(df1, df2):
    """Print summary statistics of 'lcp_render' for two frames and their percent change.

    The 'lcp_render' column of each frame is converted with pd.to_numeric
    (values that cannot be converted become NaN). Printed, in order:
      * min / max / median of df1's values, then of df2's values;
      * the per-row percent change from df1 to df2, sorted descending;
      * min / max / median of that percent change.

    Returns None.
    """
    renders = {
        "df1": pd.to_numeric(df1["lcp_render"], errors='coerce'),
        "df2": pd.to_numeric(df2["lcp_render"], errors='coerce'),
    }

    for label, series in renders.items():
        print(label + " lcp render min/max/median")
        print(series.min())
        print(series.max())
        print(series.median())
        print("\n")

    change = percentage_change(renders["df1"], renders["df2"])
    print("df1/df2 % change: \n" + str(change.sort_values(ascending=False)))
    print("\n")

    print("min/max/median")
    for statistic in (change.min, change.max, change.median):
        print(statistic())
    print("\n")

In [None]:
#@title correlate_lcp_to_metrics


def correlate_lcp_to_metrics(df):
    """Print how 'lcp_render' correlates with other metrics in `df`.

    First prints the Pearson correlation of 'lcp_render' with each of
    'fetchStart', 'responseStart' and 'domComplete', one per line. Then prints
    the correlation matrix of the columns at positions 0 through 15 of `df`.

    Returns None.
    """
    lcp_render = df["lcp_render"]

    for metric in ("fetchStart", "responseStart", "domComplete"):
        coefficient = lcp_render.corr(df[metric], method='pearson')
        print(metric + " " + str(coefficient))
    print("\n")

    # Correlate the first sixteen columns, selected by position.
    leading_columns = df.iloc[:, list(range(16))]
    print(leading_columns.corr())

In [None]:
#@title intersection_of_2_data_frames_by_index

# take two data frames and make sure that each have the same set of indexes (urlm)
def intersection_of_2_data_frames_by_index(df1, df2):
    """Report how the indexes of two frames overlap and return the shared labels.

    Prints the length of each index, the labels found in only one of the two
    frames (combined, then per frame), and the labels found in both.

    Returns the pandas Index of labels present in both frames.
    """
    first, second = df1.index, df2.index
    print(f"df1 len: {len(first)}")
    print(f"df2 len: {len(second)}")

    # Labels that appear in exactly one of the two indexes.
    in_one_only = first.symmetric_difference(second)
    print("Indexes present in only one df: ", len(in_one_only), in_one_only)

    print("Indexes unique to df1:", first.difference(second))
    print("Indexes unique to df2:", second.difference(first))

    # Labels that appear in both indexes.
    shared = first.intersection(second)
    print("Indexes present in both dfs: ", len(shared), shared)

    return shared

# analysis

In [None]:
#@title 2 browsers all metrics

# Load the per-site median metrics for each browser from the same results
# file (data100_file) and print both frames in full.
#df1 = deserialize_json(data1_file)
#print(df1)

# Firefox results; dffx100 is reused by later cells.
print("firefox")
dffx100 = deserialize_json_metrics_by_browser(data100_file, "firefox")
print(dffx100)

print(newline)

# Chrome results; dfchr100 is reused by later cells.
print("chrome")
dfchr100 = deserialize_json_metrics_by_browser(data100_file, "chrome")
print(dfchr100)

firefox
dataframe rows: 90
                                           fcp lcp_load lcp_render redirectStart redirectEnd fetchStart domainLookupStart connectStart requestStart responseStart domInteractive domComplete loadEventEnd SpeedIndex VisualComplete85 LastVisualChange                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   lcp_element
4usted                                     481      682        682             0           0          5                37           49  

In [None]:
#@title firefox metric correlation matrix

# Print the lcp_render correlations and the metric correlation matrix for the Firefox results.
correlate_lcp_to_metrics(dffx100)



fetchStart -0.08055493112798821
responseStart 0.8294078246423595
domComplete 0.8505927680584393


                        fcp  lcp_load  lcp_render  redirectStart  redirectEnd  fetchStart  domainLookupStart  connectStart  requestStart  responseStart  domInteractive  domComplete  loadEventEnd  SpeedIndex  VisualComplete85  LastVisualChange
fcp                1.000000  0.842766    0.867782      -0.050008    -0.038963   -0.051662          -0.033327     -0.043907      0.358570       0.975791        0.983714     0.916048      0.915843    0.768958          0.607220          0.574526
lcp_load           0.842766  1.000000    0.984028      -0.077324    -0.090307   -0.097938          -0.068638     -0.087103      0.430277       0.806586        0.836497     0.832281      0.831934    0.714772          0.599532          0.588098
lcp_render         0.867782  0.984028    1.000000      -0.071940    -0.066774   -0.080555          -0.055725     -0.076912      0.424222       0.829408        0.857369     0

In [None]:
#@title chrome metric correlation matrix

# Print the lcp_render correlations and the metric correlation matrix for the Chrome results.
correlate_lcp_to_metrics(dfchr100)

fetchStart -0.08548675996740042
responseStart 0.6836705945928653
domComplete 0.19800199075945568


                        fcp  lcp_load  lcp_render  redirectStart  redirectEnd  fetchStart  domainLookupStart  connectStart  requestStart  responseStart  domInteractive  domComplete  loadEventEnd  SpeedIndex  VisualComplete85  LastVisualChange
fcp                1.000000  0.823670    0.743890      -0.061184    -0.033874   -0.041955          -0.042366     -0.010500      0.447363       0.969942        0.971570     0.222203      0.222261    0.201322          0.189254          0.244400
lcp_load           0.823670  1.000000    0.805107      -0.085347    -0.092922   -0.101894          -0.100551     -0.080395      0.409251       0.760135        0.817602     0.177661      0.177674    0.152552          0.136278          0.223519
lcp_render         0.743890  0.805107    1.000000      -0.092483    -0.071222   -0.085487          -0.085877     -0.050290      0.414083       0.683671        0.748932     

In [None]:
#@title 2 browsers LCP element match percent

# Share of sites for which Firefox and Chrome report the same LCP element.
cmp_pre = compare_2_lcp_elements(dffx100, dfchr100)
print(f"ff/chrome lcp match: {cmp_pre}")



ff/chrome lcp match: 53%


In [None]:
#@title 2 browsers LCP element match sites

# Boolean mask: True where Chrome and Firefox report the same LCP element.
dflcpeqp = dfchr100["lcp_element"] == dffx100["lcp_element"]
print(f"df1 len: {len(dfchr100.index)}")
print(f"df2 len: {len(dffx100.index)}")

# Summing the mask counts only its True entries, i.e. the matching sites.
print(f"fenix/chrome lcp matching set length: {dflcpeqp.sum()}")


df1 len: 90
df2 len: 90
fenix/chrome lcp matching set length: 48


In [None]:
#@title 2 browsers LCP Render all sites

# Print lcp_render min/max/median for Firefox (df1) and Chrome (df2), plus the
# per-site percent change from Firefox to Chrome, over all sites.
compare_2_lcp_values(dffx100, dfchr100)

firefox lcp render min/max/median
230
22997
1334.0


chrome lcp render min/max/median
278
19216
1746.0


firefox/chrome % change: 
die_luke_info                              2488.474576
maisterboards                               261.884368
haymanpens                                  192.845258
shop_beyer_soehne_de                        192.187500
beautywestafrica                            167.568398
tarasfactory_pl                             113.262599
unitedreggae                                 86.895987
tsum_promodo_digital                         77.261848
ar_cairomap360                               61.538462
v_bar_pl                                     58.719647
research_danskebank_research                 58.603334
mrshark_gumroad                              53.174603
ceraplastica                                 51.898734
joshi_group                                  51.600000
my_store_11166242_creator_spring             50.000000
emargit_gportal_hu                          

In [None]:
#@title 2 browsers LCP Render LCP match only

# Boolean mask, one entry per site: True where Firefox and Chrome report the
# same LCP element.
dflcpeqp = dffx100["lcp_element"] == dfchr100["lcp_element"]
print("df1 len: " + str(len(dffx100.index)))
print("df2 len: " + str(len(dfchr100.index)))
# Count the True entries. len() of the mask is the total number of sites, not
# the number of matches, so the original line reported the wrong figure.
print("fenix/chrome lcp matching set length: " + str(dflcpeqp.sum()))


# Keep only the sites whose LCP elements match, in both frames.
df1_setas1 = dffx100[dflcpeqp]
print("firefox lcp match length: " + str(len(df1_setas1.index)))


df2_setas1 = dfchr100[dflcpeqp]
print("chrome lcp match length: " + str(len(df2_setas1.index)))


# Compare lcp_render between the browsers on the matching sites only.
compare_2_lcp_values(df1_setas1, df2_setas1)




df1 len: 90
df2 len: 90
fenix/chrome lcp matching set length: 90
firefox lcp match length: 48
chrome lcp match length: 48
firefox lcp render min/max/median
232
22997
1261.0


chrome lcp render min/max/median
296
19216
1601.0


firefox/chrome % change: 
haymanpens                                 192.845258
ar_cairomap360                              61.538462
v_bar_pl                                    58.719647
ceraplastica                                51.898734
joshi_group                                 51.600000
bameethainoodle                             48.506401
4usted                                      45.747801
freeflix_tv_en_malavida_android             45.023697
destinationsport                            44.295830
spertiveicoli_it                            41.092637
flandersfood_nl                             39.675174
protravel_pl                                37.112750
linkoping                                   35.071090
thearbalistguild_forumotion                 3

# analysis (accessibility-specific)

In [None]:
#@title 2 browsers LCP Render LCP element matching only (a11y)

# Firefox results from the second results file (data100a_file), labelled
# "a11y enabled" in the output below.
dffx100a = deserialize_json_metrics_by_browser(data100a_file, "firefox")

# Sites present in both the baseline Firefox run and the a11y run.
indexi = intersection_of_2_data_frames_by_index(dffx100, dffx100a)
print()

# Select the shared sites by label so both frames end up with the same rows in
# the same order; the element comparison below is done by pandas on Series
# that must be identically labeled. (Filtering each frame with isin() kept
# each frame's own row order, which is not guaranteed to agree.)
df1u = dffx100.loc[indexi]
print("common df1: firefox")
print(len(df1u), df1u)
print("\n")

df2u = dffx100a.loc[indexi]
print("common df2: firefox a11y enabled")
print(len(df2u), df2u)
print("\n")

# Boolean mask: True where both runs report the same LCP element.
dflcpeqpa = df1u["lcp_element"] == df2u["lcp_element"]
# Count the True entries; len() of the mask is the number of shared sites, not
# the number of matches.
print("fenix/fenix-a11y lcp matching set length: " + str(dflcpeqpa.sum()))
print()

# Keep only the sites whose LCP elements match. Both frames are filtered with
# the same mask over the same labels, so they always have equal length and the
# former post-filter length check could never fire.
df1um = df1u[dflcpeqpa]
df2um = df2u[dflcpeqpa]
compare_2_lcp_values(df1um, df2um)




dataframe rows: 91
df1 len: 90
df2 len: 91
Indexes present in only one df:  7 Index(['charlesschwabchallenge', 'dateready_2h_fit1', 'firia_pl', 'iconinteriors_ru', 'lakfm_lk', 'ridis_ru', 'unitedreggae'], dtype='object')
Indexes unique to df1: Index(['charlesschwabchallenge', 'dateready_2h_fit1', 'unitedreggae'], dtype='object')
Indexes unique to df2: Index(['firia_pl', 'iconinteriors_ru', 'lakfm_lk', 'ridis_ru'], dtype='object')
Indexes present in both dfs:  87 Index(['4usted', 'addario', 'aldositaliangreengates_co_uk', 'applianceprousa', 'ar_cairomap360', 'arihanthospital', 'artesp', 'bameethainoodle', 'beautywestafrica', 'boost_your_audience', 'botzsak_arukereso_hu', 'brockvillenissan', 'bryansk_ucheba_ru', 'bulutokul', 'capitaleducrimeradio', 'centrodemedicinaregenerativa', 'ceraplastica', 'chuckmechanicalpe', 'cockpit_lebara_ch_4', 'colombiaricebox_ola_click', 'destinationsport', 'detfriasverige_se', 'die_luke_info', 'dogburger_menudino', 'dominiquegambier_fr', 'e_mancare_ro', 'em

# analysis (web content)

In [None]:
#@title 2 browsers LCP Render results, element matching only (by webcontent)

# NOTE(review): this cell is titled "by webcontent" but repeats the a11y
# comparison (same data100a_file, same output labels) and applies no
# content-trait filter. Presumably deserialize_json_metrics_by_browser_if with
# a specific trait was intended here -- confirm and pick the trait.
dffx100a = deserialize_json_metrics_by_browser(data100a_file, "firefox")

# Sites present in both the baseline Firefox run and the second run.
indexi = intersection_of_2_data_frames_by_index(dffx100, dffx100a)
print()

# Select the shared sites by label so both frames end up with the same rows in
# the same order; the element comparison below is done by pandas on Series
# that must be identically labeled. (Filtering each frame with isin() kept
# each frame's own row order, which is not guaranteed to agree.)
df1u = dffx100.loc[indexi]
print("common df1: firefox")
print(len(df1u), df1u)
print("\n")

df2u = dffx100a.loc[indexi]
print("common df2: firefox a11y enabled")
print(len(df2u), df2u)
print("\n")

# Boolean mask: True where both runs report the same LCP element.
dflcpeqpa = df1u["lcp_element"] == df2u["lcp_element"]
# Count the True entries; len() of the mask is the number of shared sites, not
# the number of matches.
print("fenix/fenix-a11y lcp matching set length: " + str(dflcpeqpa.sum()))
print()

# Keep only the sites whose LCP elements match. Both frames are filtered with
# the same mask over the same labels, so they always have equal length and the
# former post-filter length check could never fire.
df1um = df1u[dflcpeqpa]
df2um = df2u[dflcpeqpa]
compare_2_lcp_values(df1um, df2um)


