Replicating the income statement as it appears on the SEC form without extra revenue segments being pulled in
Opened this issue · 3 comments
I am trying to replicate the income statement as shown in the company's 10-K (or 10-Q) so I can perform segment analysis.
For example, for Tesla (found here) we want
where we can see revenue is broken out by segment. However, it is difficult (if not impossible) to recreate this table without extra data being pulled in. My code, as follows, will demonstrate this.
import pandas as pd

# NOTE(review): `xbrlApi` is not defined in this snippet -- presumably an
# initialized sec_api.XbrlApi client (e.g. xbrlApi = XbrlApi(API_KEY)).
# Confirm before running.
url_10k_tsla = 'https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231.htm'
xbrl_tsla_json = xbrlApi.xbrl_to_json(htm_url=url_10k_tsla)

# Initialize an empty DataFrame keyed by reporting period.
df = pd.DataFrame(columns=['period'])

# Iterate through each US-GAAP concept in the income statement.
for key, values in xbrl_tsla_json['StatementsOfIncome'].items():
    # See if any fact for this concept carries a segment dimension.
    has_segments = any('segment' in value for value in values)
    for value in values:
        # Label the fact by its reporting period: "startDate - endDate".
        period = value['period']['startDate'] + ' - ' + value['period']['endDate']
        if has_segments:
            # Use the segment member name (namespace prefix stripped),
            # or 'Total' for the consolidated (segment-less) fact.
            segment_name = value.get('segment', {}).get('value', 'Total').split(':')[-1]
            value_key = key + ' - ' + segment_name
        else:
            # No segments: the concept name alone is the column.
            value_key = key
        # Update the existing row for this period, or append a new one.
        if period in df['period'].values:
            df.loc[df['period'] == period, value_key] = value['value']
        else:
            row = {'period': period, value_key: value['value']}
            df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

df.set_index('period', inplace=True)

# Convert to float64 and scale to millions, leaving the per-share columns
# untouched.  (The original reassigned `df` to the EPS-filtered frame twice,
# which silently dropped EarningsPerShareBasic/Diluted, and used the
# un-imported `np.float64`.)
numeric_cols = df.columns[~df.columns.isin(['EarningsPerShareBasic', 'EarningsPerShareDiluted'])]
df[numeric_cols] = df[numeric_cols].astype('float64') / 1_000_000
df.T
will produce the following pandas table:
In particular, the table lists two extra revenue sources, RevenueFromContractWithCustomerExcludingAssessedTax - EnergyGenerationAndStorageSalesMember and RevenueFromContractWithCustomerExcludingAssessedTax - SalesAndServicesMember as well as two extra gross profits (although I am not as concerned with these). These two extra revenue sources come from the 'Revenue Recognition - Revenue by source' table which is separate from the income statement. Is there a way to keep these extra revenue sources from being pulled in (without hard coding their names since I want to apply this technique to many companies) so that my code replicates exactly the income statement as it appears in the filing?
I actually got this code working if anyone is interested:
import pandas as pd
"""
This code will take the income statement and give us an 'itemized' list of revenues and cost of goods sold by segment.
It assumes, based on the data structure, that segments that come after the section total are from a different section of the filing paperwork
"""
def get_cleaned_income_statement_yearly(xbrl_json):
    """Build a yearly income statement from sec-api XBRL-JSON.

    Returns a DataFrame with US-GAAP concepts (optionally suffixed with
    their segment name) as rows, fiscal years as columns, and values in
    millions -- except the EarningsPerShare columns, which are left as-is.

    It assumes, based on the fact ordering, that segment facts appearing
    AFTER a concept's consolidated total come from a different section of
    the filing (e.g. the revenue-by-source note) and must be skipped.

    NOTE(review): indentation was lost in the original paste; the nesting
    below is reconstructed so every income-statement fact (segment rows and
    the total) is recorded before the early break -- confirm against the
    author's working copy.
    """
    df = pd.DataFrame(columns=['period'])
    for key, values in xbrl_json['StatementsOfIncome'].items():
        # Does any fact for this concept carry a segment dimension?
        has_segments = any('segment' in value for value in values)
        for i, value in enumerate(values):
            # Label the fact by its fiscal year (the endDate's year part).
            period = value['period']['endDate'].split('-')[0]
            if has_segments:
                # Member name with the namespace prefix stripped, or
                # 'Total' for the consolidated (segment-less) fact.
                segment_name = value.get('segment', {}).get('value', 'Total').split(':')[-1]
                # Split the CamelCase member name into words; the appended
                # sentinel 'A' closes the final word.
                pos = [ind for ind, ch in enumerate(segment_name + 'A') if ch.isupper()]
                parts = [segment_name[pos[j]:pos[j + 1]] for j in range(len(pos) - 1)]
                if len(parts) == 1:
                    value_key = key + ' - ' + 'Total'
                else:
                    # Drop the trailing 'Member' suffix word from the label.
                    value_key = key + ' - ' + ''.join(parts[:-1])
            else:
                value_key = key
            # Update the existing year row, or append a new one.
            if period in df['period'].values:
                df.loc[df['period'] == period, value_key] = value['value']
            else:
                row = {'period': period, value_key: value['value']}
                df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
            # After the consolidated total, a fact that again carries a
            # segment marks the start of a different statement's data, so
            # stop processing this concept.
            if has_segments and value.get('segment', {}).get('value') is None:
                if i + 1 < len(values) and 'segment' in values[i + 1]:
                    break
    df = df.rename(columns={'period': 'Year'})
    df.set_index('Year', inplace=True)
    # Convert to float64 and scale to millions, leaving the per-share
    # columns untouched.  ('float64' avoids the original's reference to the
    # never-imported numpy as `np`.)
    columns_to_scale = df.columns[~df.columns.isin(['EarningsPerShareBasic', 'EarningsPerShareDiluted'])]
    df[columns_to_scale] = df[columns_to_scale].astype('float64').divide(1_000_000)
    return df.T
and here is some code to clean up cash flows:
# convert XBRL-JSON of cash flow statement to pandas dataframe
def get_cleaned_cash_flow_statement_yearly(xbrl_json):
    """Convert the XBRL-JSON cash flow statement to a pandas DataFrame.

    Rows are US-GAAP concepts, columns are period labels, values are in
    millions.  Only consolidated (non-segment) facts are kept.  The single
    cash-balance concept, reported as instant facts, is split into a
    beginning row and a '<concept> - Ending' row aligned with the
    date-range columns, and the instant-date columns are dropped.
    """
    cash_flows_store = {}
    for us_gaap_item, facts in xbrl_json['StatementsOfCashFlows'].items():
        values = []
        indices = []
        for fact in facts:
            # Keep consolidated facts only; skip segment breakdowns.
            if 'segment' in fact:
                continue
            # A fact covers either a point in time ("instant") or a
            # start/end date range.
            if 'instant' in fact['period']:
                index = fact['period']['instant']
            else:
                index = fact['period']['startDate'] + '-' + fact['period']['endDate']
            # Avoid duplicate indices carrying the same values.
            if index in indices:
                continue
            # Missing values count as zero; scale to millions.
            values.append(int(fact['value']) / 1_000_000 if 'value' in fact else 0)
            indices.append(index)
        cash_flows_store[us_gaap_item] = pd.Series(values, index=indices)
    cash_flows = pd.DataFrame(cash_flows_store).T
    # Populate the starting and ending cash columns correctly.
    # Columns where every value but one is NaN are the instant (cash
    # balance) dates rather than real period columns.
    beginning_cash_columns = cash_flows.columns[cash_flows.isna().sum() == len(cash_flows) - 1]
    # Row holding those instant facts -- they should all name the same
    # line, e.g. CashCashEquivalents...
    cash_flow_lines = cash_flows.loc[:, beginning_cash_columns].notnull().idxmax()
    row_to_insert_after = cash_flow_lines.iloc[0]
    # Position just below that row.
    idx = cash_flows.index.get_loc(row_to_insert_after) + 1
    # Duplicate the cash row directly beneath itself as the "Ending" row.
    new_row_df = pd.DataFrame(cash_flows.iloc[idx - 1].copy()).T
    new_row_df.index = [row_to_insert_after + ' - Ending']
    cash_flows = pd.concat([cash_flows.iloc[:idx], new_row_df, cash_flows.iloc[idx:]])
    # Forward-fill beginning balances and back-fill ending balances so each
    # period column carries its opening and closing cash amount.
    cash_flows.loc[row_to_insert_after] = cash_flows.loc[row_to_insert_after].ffill()
    cash_flows.loc[row_to_insert_after + ' - Ending'] = cash_flows.loc[row_to_insert_after + ' - Ending'].bfill()
    # Drop the now-redundant instant-date columns.
    cash_flows = cash_flows.drop(beginning_cash_columns, axis=1)
    return cash_flows
working on the balance sheet now
I wonder if there is a way using sec-api to pull in the company facts JSON, e.g. https://data.sec.gov/api/xbrl/companyfacts/CIK0000355379.json
to resolve the XBRL attributes to their English counterparts, i.e. us-gaap:CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents becomes Cash, Cash Equivalents, Restricted Cash, and Restricted Cash Equivalents