My Personal Journey with Pandas Data Loading¶
Author: Mohammad Sayem Chowdhury
Focus: Mastering data loading and manipulation with Pandas
Part of my comprehensive data analysis toolkit
My Data Loading Adventure¶
Welcome to my exploration of Pandas - the powerhouse library that transformed how I work with data! In this notebook, I'll demonstrate the essential skills I've developed for loading, accessing, and manipulating datasets.
What I'll Accomplish:
- Master CSV and Excel file loading with Pandas
- Explore dataset structures and properties
- Learn efficient data access patterns
- Practice real-world data manipulation scenarios
Estimated learning time: 15-20 minutes of hands-on practice
Table of Contents
My Learning Path¶
What I'll Explore:¶
- Understanding My Dataset - Album sales data that matches my music interests
- Pandas Fundamentals - The foundation of my data analysis workflow
- Data Access Mastery - Efficient ways to view and extract information
- Practical Exercises - Real scenarios from my data projects
About My Music Dataset - Top Selling Albums¶
For this exploration, I'm working with a fascinating dataset about top-selling albums - perfect for someone who appreciates both data and music!
The table has one row for each album and several columns.
- artist: Name of the artist
- album: Name of the album
- released_year: Year the album was released
- length_min_sec: Length of the album (hours:minutes:seconds)
- genre: Genre of the album
- music_recording_sales_millions: Music recording sales (millions in USD) on [SONG://DATABASE]
- claimed_sales_millions: Album's claimed sales (millions in USD) on [SONG://DATABASE]
- date_released: Date on which the album was released
- soundtrack: Indicates whether the album is a movie soundtrack (Y) or not (N)
- rating_of_friends: Indicates the rating from your friends from 1 to 10
You can see the dataset here:
| Artist | Album | Released | Length | Genre | Music recording sales (millions) | Claimed sales (millions) | Released (date) | Soundtrack | Rating (friends) |
|---|---|---|---|---|---|---|---|---|---|
| Michael Jackson | Thriller | 1982 | 00:42:19 | Pop, rock, R&B | 46 | 65 | 30-Nov-82 | | 10.0 |
| AC/DC | Back in Black | 1980 | 00:42:11 | Hard rock | 26.1 | 50 | 25-Jul-80 | | 8.5 |
| Pink Floyd | The Dark Side of the Moon | 1973 | 00:42:49 | Progressive rock | 24.2 | 45 | 01-Mar-73 | | 9.5 |
| Whitney Houston | The Bodyguard | 1992 | 00:57:44 | Soundtrack/R&B, soul, pop | 27.4 | 44 | 17-Nov-92 | Y | 7.0 |
| Meat Loaf | Bat Out of Hell | 1977 | 00:46:33 | Hard rock, progressive rock | 20.6 | 43 | 21-Oct-77 | | 7.0 |
| Eagles | Their Greatest Hits (1971-1975) | 1976 | 00:43:08 | Rock, soft rock, folk rock | 32.2 | 42 | 17-Feb-76 | | 9.5 |
| Bee Gees | Saturday Night Fever | 1977 | 1:15:54 | Disco | 20.6 | 40 | 15-Nov-77 | Y | 9.0 |
| Fleetwood Mac | Rumours | 1977 | 00:40:01 | Soft rock | 27.9 | 40 | 04-Feb-77 | | 9.5 |
This dataset speaks to my love of music and data analysis. Each row represents an album with detailed information that I find fascinating:
Dataset Structure:
- artist: The musician or band (always exciting to see favorites!)
- album: Album name (some true classics here)
- released_year: When the magic happened
- length_min_sec: Total album duration (I love analyzing this!)
- genre: Musical style (shows the diversity I appreciate)
- music_recording_sales_millions: Verified sales figures
- claimed_sales_millions: What the industry claims
- date_released: Exact release date for historical context
- soundtrack: Movie soundtrack indicator (Y/N)
- rating_of_friends: Personal ratings from 1-10 (subjective but interesting!)
This combination of quantitative data and personal preferences makes it perfect for demonstrating Pandas capabilities.
Sample Data Preview:
Here's a glimpse of the albums I'll be analyzing - notice the variety of genres and eras:
| Artist | Album | Released | Length | Genre | Sales (M) | Rating |
|---|---|---|---|---|---|---|
| Michael Jackson | Thriller | 1982 | 00:42:19 | Pop, rock, R&B | 46 | 10.0 |
| AC/DC | Back in Black | 1980 | 00:42:11 | Hard rock | 26.1 | 8.5 |
| Pink Floyd | The Dark Side of the Moon | 1973 | 00:42:49 | Progressive rock | 24.2 | 9.5 |
| Whitney Houston | The Bodyguard | 1992 | 00:57:44 | Soundtrack/R&B | 26.1 | 7.0 |
This mix of classic rock, pop, and soundtrack albums provides rich data for analysis!
My Introduction to Pandas - The Data Analysis Game Changer¶
# Installing dependencies for my data analysis workflow
# xlrd is needed for Excel file support
!pip install xlrd
print("Dependencies installed! Ready for data analysis.")
Requirement already satisfied: xlrd in e:\anaconda\lib\site-packages (1.2.0)
# My essential data analysis import
# Pandas is the cornerstone of my data analysis toolkit
import pandas as pd
print("Pandas imported successfully! Let's dive into data analysis.")
After importing Pandas, I have access to powerful data manipulation tools that have revolutionized my analysis workflow. The DataFrame is my go-to data structure - think of it as an intelligent spreadsheet on steroids!
My approach: I'll demonstrate loading data from CSV files, which are my preferred format for data sharing and storage. The variable csv_path contains the data location, and df (my standard DataFrame abbreviation) will hold our musical treasure trove.
# Loading my music dataset from CSV
# This URL contains the top-selling albums data I'll analyze
csv_path = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0101EN-SkillsNetwork/labs/Module%204/data/TopSellingAlbums.csv'
# Creating my DataFrame - the foundation of my analysis
my_music_df = pd.read_csv(csv_path)
print("Success! My music dataset is loaded and ready for exploration.")
print(f"Dataset shape: {my_music_df.shape} (rows, columns)")
The head() method is my first stop when exploring any new dataset. It shows me the first five rows - like getting a sneak peek before diving deep:
# My first look at the dataset - the crucial preview
print("First glimpse of my music dataset:")
my_music_df.head()
| Artist | Album | Released | Length | Genre | Music Recording Sales (millions) | Claimed Sales (millions) | Released.1 | Soundtrack | Rating | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Jackson | Thriller | 1982 | 0:42:19 | pop, rock, R&B | 46.0 | 65 | 30-Nov-82 | NaN | 10.0 |
| 1 | AC/DC | Back in Black | 1980 | 0:42:11 | hard rock | 26.1 | 50 | 25-Jul-80 | NaN | 9.5 |
| 2 | Pink Floyd | The Dark Side of the Moon | 1973 | 0:42:49 | progressive rock | 24.2 | 45 | 01-Mar-73 | NaN | 9.0 |
| 3 | Whitney Houston | The Bodyguard | 1992 | 0:57:44 | R&B, soul, pop | 27.4 | 44 | 17-Nov-92 | Y | 8.5 |
| 4 | Meat Loaf | Bat Out of Hell | 1977 | 0:46:33 | hard rock, progressive rock | 20.6 | 43 | 21-Oct-77 | NaN | 8.0 |
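Beyond head(), a quick structural check with shape and dtypes catches parsing surprises early - for example, that the Length column loads as plain strings. Here's a minimal sketch using a small in-memory sample (a hypothetical subset of the album data, so it runs without network access):

```python
import io
import pandas as pd

# Hypothetical in-memory subset of the album CSV, so the checks run offline
sample_csv = io.StringIO(
    "Artist,Album,Released,Length\n"
    "Michael Jackson,Thriller,1982,0:42:19\n"
    "AC/DC,Back in Black,1980,0:42:11\n"
)
df = pd.read_csv(sample_csv)

print(df.shape)   # (2, 4)
print(df.dtypes)  # Released parses as int64; Length stays object (string)
```

The same shape/dtypes check applies unchanged to the full dataset loaded from the URL.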
I can also work with Excel files - another common format in my data analysis projects. Pandas makes this nearly as seamless, though loading from a remote URL only works while the host is still online - as the legacy link below demonstrates:
# Read data from Excel File and print the first five rows
xlsx_path = 'https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/PY0101EN/Chapter%204/Datasets/TopSellingAlbums.xlsx'
my_music_df = pd.read_excel(xlsx_path)
print("Excel file loaded successfully!")
print("\nFirst 5 rows from Excel data:")
my_music_df.head()
(Long traceback truncated for readability.) The request to the legacy softlayer.net host fails while opening the connection, and pandas surfaces it as:
URLError: <urlopen error [Errno 2] No such file or directory>
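When a remote source might be unavailable, I can guard the load instead of letting the notebook die on a URLError. This is a hedged sketch of one defensive pattern (not the course's code): try the Excel URL first, then fall back to the CSV copy.

```python
import pandas as pd

# Hypothetical helper: try the Excel source, fall back to a CSV copy on failure
def load_albums(xlsx_path, csv_path):
    try:
        return pd.read_excel(xlsx_path)
    except Exception as exc:  # URLError, HTTPError, FileNotFoundError, ...
        print(f"Excel load failed ({type(exc).__name__}); falling back to CSV")
        return pd.read_csv(csv_path)
```

With the xlsx_path and csv_path variables defined earlier, `load_albums(xlsx_path, csv_path)` would return the CSV-backed DataFrame whenever the Excel host is unreachable.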
Now for the fun part - accessing specific data! Let me extract the 'Length' column to analyze album durations:
# Accessing the Length column - analyzing album durations
# Double brackets create a DataFrame (vs single bracket for Series)
album_lengths = my_music_df[['Length']]
print("Album lengths as DataFrame:")
print(f"Data type: {type(album_lengths)}")
album_lengths
| Length | |
|---|---|
| 0 | 0:42:19 |
| 1 | 0:42:11 |
| 2 | 0:42:49 |
| 3 | 0:57:44 |
| 4 | 0:46:33 |
| 5 | 0:43:08 |
| 6 | 1:15:54 |
| 7 | 0:40:01 |
My Understanding: Using double brackets [['Length']] creates a DataFrame with just that column. This maintains the tabular structure, which is useful for further analysis operations.
Visual Concept:
Original DataFrame: [Artist | Album | Length | Genre | ...]
↓
Selected Column: [Length]
↓
New DataFrame: [Length]
[00:42:19]
[00:42:11]
[00:42:49]
[...]
This pattern is fundamental to my data analysis workflow!
My Data Access Mastery - Viewing and Extracting Information¶
Series vs DataFrame - A Key Distinction I've Learned:
Using single brackets gives me a Series (1-dimensional), while double brackets give me a DataFrame (2-dimensional). Here's the Series approach:
# Getting a column as a Series - 1-dimensional data
length_series = my_music_df['Length']
print(f"Data type: {type(length_series)}")
print(f"Series name: {length_series.name}")
print("\nFirst few values:")
length_series
0    0:42:19
1    0:42:11
2    0:42:49
3    0:57:44
4    0:46:33
5    0:43:08
6    1:15:54
7    0:40:01
Name: Length, dtype: object
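Notice the dtype: object - the lengths are plain strings, so they can't be averaged directly. A sketch of one way to make them numeric, assuming the H:MM:SS format shown above (sample values hard-coded for illustration):

```python
import pandas as pd

# Hypothetical sample of Length values as they appear in the dataset
lengths = pd.Series(["0:42:19", "0:42:11", "1:15:54"], name="Length")

# pd.to_timedelta parses "H:MM:SS" strings into Timedelta values
durations = pd.to_timedelta(lengths)

# Once numeric, aggregation works: total seconds per album
seconds = durations.dt.total_seconds()
print(seconds.tolist())  # [2539.0, 2531.0, 4554.0]
```

The same conversion applied to the full Length column would let me compute mean album duration, longest album, and so on.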
For my analysis workflow, I often prefer DataFrame format even for single columns because it maintains consistency with my data processing pipeline:
# Getting a column as a DataFrame - maintains 2D structure
artist_df = my_music_df[['Artist']]
print(f"Data type: {type(artist_df)}")
print(f"Shape: {artist_df.shape}")
print("\nDataFrame structure:")
artist_df.head()
pandas.core.frame.DataFrame
Multi-column Selection - My Power Move:
When I need multiple columns for analysis, I create a focused dataset with just the relevant information. This is essential for my music analysis projects:
To select multiple columns, I pass a list of column names inside the brackets - the double-bracket pattern again, e.g. my_music_df[['Artist', 'Length', 'Genre']]. The result is a new DataFrame comprised of just those columns:
# Creating my focused analysis dataset
# Artist, Length, and Genre are key for my music analysis
my_analysis_data = my_music_df[['Artist', 'Length', 'Genre']]
print("My focused music analysis dataset:")
print(f"Shape: {my_analysis_data.shape}")
print("\nData preview:")
my_analysis_data
| Artist | Length | Genre | |
|---|---|---|---|
| 0 | Michael Jackson | 0:42:19 | pop, rock, R&B |
| 1 | AC/DC | 0:42:11 | hard rock |
| 2 | Pink Floyd | 0:42:49 | progressive rock |
| 3 | Whitney Houston | 0:57:44 | R&B, soul, pop |
| 4 | Meat Loaf | 0:46:33 | hard rock, progressive rock |
| 5 | Eagles | 0:43:08 | rock, soft rock, folk rock |
| 6 | Bee Gees | 1:15:54 | disco |
| 7 | Fleetwood Mac | 0:40:01 | soft rock |
My Multi-Column Selection Process:
Visual Representation of My Selection:
Original Dataset: [Artist | Album | Released | Length | Genre | Sales | Rating | ...]
↓
My Selection: [Artist | Length | Genre]
↓
Result: Michael Jackson | 00:42:19 | Pop, rock, R&B
AC/DC | 00:42:11 | Hard rock
Pink Floyd | 00:42:49 | Progressive rock
...
This selective approach helps me focus on the specific aspects I want to analyze!
Precise Data Access with iloc - My Position-Based Approach¶
The iloc method is my go-to for position-based selection. It's like using coordinates to pinpoint exact data locations:
# Accessing the very first cell - row 0, column 0
# This gives me the first artist in my dataset
first_artist = my_music_df.iloc[0, 0]
print(f"First artist in my dataset: {first_artist}")
print(f"Data type: {type(first_artist)}")
first_artist
'Michael Jackson'
Let me access the second artist (row 1, column 0) - another rock legend:
# Second artist in my dataset
second_artist = my_music_df.iloc[1, 0]
print(f"Second artist: {second_artist}")
second_artist
'AC/DC'
Now let me check the release year (column 2) for the first album:
# First album's release year (row 0, column 2)
first_album_year = my_music_df.iloc[0, 2]
print(f"First album released in: {first_album_year}")
first_album_year
1982
# Second album's release year for comparison
second_album_year = my_music_df.iloc[1, 2]
print(f"Second album released in: {second_album_year}")
print(f"Time difference: {abs(first_album_year - second_album_year)} years")
second_album_year
1980
My iloc Coordinate System:
Position-Based Access Pattern:
Col 0 Col 1 Col 2 Col 3
(Artist) (Album) (Year) (Length)
Row 0: Michael J. Thriller 1982 00:42:19 ← iloc[0,0], iloc[0,2]
Row 1: AC/DC Back in Bl. 1980 00:42:11 ← iloc[1,0], iloc[1,2]
Row 2: Pink Floyd Dark Side 1973 00:42:49
...
**My iloc strategy:** Think of it as a grid where I specify (row, column) coordinates!
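Beyond single cells, iloc also handles whole rows: one index returns the row as a Series, and a list of positions returns several rows as a DataFrame. A minimal sketch on a hypothetical miniature of the album table:

```python
import pandas as pd

# Hypothetical three-row miniature of the album table
df = pd.DataFrame(
    {"Artist": ["Michael Jackson", "AC/DC", "Pink Floyd"],
     "Album": ["Thriller", "Back in Black", "The Dark Side of the Moon"],
     "Released": [1982, 1980, 1973]}
)

# A single index gives the whole row back as a Series
row0 = df.iloc[0]
print(row0["Artist"])  # Michael Jackson

# A list of positions picks several rows at once, as a DataFrame
subset = df.iloc[[0, 2]]
print(subset["Artist"].tolist())  # ['Michael Jackson', 'Pink Floyd']
```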
Label-Based Access with loc - My Preferred Method¶
While iloc uses positions, loc uses actual column names - much more intuitive for data analysis:
# Using column names instead of positions - much clearer!
# Getting the second artist using label-based indexing
second_artist_loc = my_music_df.loc[1, 'Artist']
print(f"Second artist (using loc): {second_artist_loc}")
second_artist_loc
'AC/DC'
# Let me also check their album name
second_album = my_music_df.loc[1, 'Album']
print(f"Album: {second_album}")
print(f"Artist: {second_artist_loc}")
second_album
'Back in Black'
# Checking release years using meaningful column names
first_album_release = my_music_df.loc[0, 'Released']
print(f"First album release year: {first_album_release}")
first_album_release
1982
# Comparing release years for historical context
second_album_release = my_music_df.loc[1, 'Released']
print(f"Release timeline:")
print(f" 1st album: {first_album_release}")
print(f" 2nd album: {second_album_release}")
print(f"Both from the golden age of rock!")
second_album_release
1980
My loc Label-Based System:
Artist Album Released Length
Row 0: Michael J. Thriller 1982 00:42:19 ← loc[0,'Artist'], loc[0,'Released']
Row 1: AC/DC Back in Bl. 1980 00:42:11 ← loc[1,'Artist'], loc[1,'Released']
Row 2: Pink Floyd Dark Side 1973 00:42:49
...
Why I prefer loc: Column names make my code self-documenting and less prone to errors!
Data Slicing - My Efficiency Technique¶
Slicing lets me grab multiple rows and columns at once - perfect for creating focused analysis datasets:
# Slicing the dataframe
# Position-based slicing: first 2 rows, first 3 columns
# Great for quick data exploration
data_slice_iloc = my_music_df.iloc[0:2, 0:3]
print("Position-based slice (first 2 rows, first 3 columns):")
data_slice_iloc
| Artist | Album | Released | |
|---|---|---|---|
| 0 | Michael Jackson | Thriller | 1982 |
| 1 | AC/DC | Back in Black | 1980 |
My Position-Based Slicing Result:
Original: [Artist | Album | Released | Length | Genre | ...]
↓ (select rows 0-1, cols 0-2)
Sliced: [Artist | Album | Released]
[Michael J. | Thriller | 1982]
[AC/DC | Back in Black | 1980]
# Slicing the dataframe using name
# Label-based slicing: much more intuitive and readable
# From 'Artist' to 'Released' columns, first 3 rows
data_slice_loc = my_music_df.loc[0:2, 'Artist':'Released']
print("Label-based slice (Artist to Released, first 3 rows):")
print("This approach is more maintainable and self-documenting!")
data_slice_loc
| Artist | Album | Released | |
|---|---|---|---|
| 0 | Michael Jackson | Thriller | 1982 |
| 1 | AC/DC | Back in Black | 1980 |
| 2 | Pink Floyd | The Dark Side of the Moon | 1973 |
My Label-Based Slicing Advantage:
Columns: Artist → Album → Released
Rows: 0, 1, 2
Result: [Artist | Album | Released]
[Michael Jackson | Thriller | 1982]
[AC/DC | Back in Black | 1980]
[Pink Floyd | The Dark Side of the Moon | 1973]
Why I love this approach: Even if columns get reordered, my slice still works!
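One subtlety worth internalizing: iloc slices follow Python conventions and exclude the stop position, while loc slices by label and include the stop label - which is why loc[0:2] above returned three rows. A quick sketch on a hypothetical miniature table:

```python
import pandas as pd

# Hypothetical miniature of the album table
df = pd.DataFrame({"Artist": ["Michael Jackson", "AC/DC", "Pink Floyd"],
                   "Released": [1982, 1980, 1973]})

# iloc follows Python slicing: the stop position is EXCLUDED -> rows 0 and 1
print(len(df.iloc[0:2]))  # 2

# loc slices by label: the stop label is INCLUDED -> rows 0, 1 and 2
print(len(df.loc[0:2]))   # 3
```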
Testing My Skills - Data Access Practice¶
Time to put my Pandas knowledge to work with some practical exercises!
Exercise 1: Extract the 'Rating' column as a DataFrame to analyze friend ratings of these albums:
# My solution: Getting ratings as DataFrame for analysis
my_ratings = my_music_df[['Rating']]
print("Friend ratings of top albums:")
print(f"Average rating: {my_ratings['Rating'].mean():.1f}/10")
print(f"Highest rating: {my_ratings['Rating'].max()}/10")
my_ratings
| Rating | |
|---|---|
| 0 | 10.0 |
| 1 | 9.5 |
| 2 | 9.0 |
| 3 | 8.5 |
| 4 | 8.0 |
| 5 | 7.5 |
| 6 | 7.0 |
| 7 | 6.5 |
Click here for the solution
q = my_music_df[['Rating']]
q
My approach: Using double brackets to maintain DataFrame structure for potential further analysis like calculating statistics.
Exercise 2: Create a focused dataset with just 'Released' and 'Artist' columns for chronological analysis:
# My chronological analysis dataset
chrono_data = my_music_df[['Released', 'Artist']]
print("Chronological view of musical legends:")
chrono_data
| Released | Artist | |
|---|---|---|
| 0 | 1982 | Michael Jackson |
| 1 | 1980 | AC/DC |
| 2 | 1973 | Pink Floyd |
| 3 | 1992 | Whitney Houston |
| 4 | 1977 | Meat Loaf |
| 5 | 1976 | Eagles |
| 6 | 1977 | Bee Gees |
| 7 | 1977 | Fleetwood Mac |
Click here for the solution
q = my_music_df[['Released', 'Artist']]
q
My insight: This combination is perfect for analyzing musical eras and artist timelines!
Exercise 3: Access a specific data point - second row, third column (what album detail is this?):
# Pinpointing specific data with iloc
specific_value = my_music_df.iloc[1, 2]
print(f"Value at position [1,2]: {specific_value}")
print(f"This represents: {my_music_df.columns[2]} for {my_music_df.iloc[1,0]}")
specific_value
1980
Click here for the solution
my_music_df.iloc[1, 2]
My understanding: Row 1, Column 2 gives me the release year of the second album in my dataset.
Exercise 4 (Advanced): Transform my DataFrame index to letters and practice custom indexing:
Use the following list to convert the index of the dataframe to characters and assign the result to df_new; find the element corresponding to row index 'a' and column 'Artist'. Then select rows 'a' through 'd' for the column 'Artist'.
# Creating a custom indexed version for advanced access patterns
custom_index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
my_custom_df = my_music_df.copy() # Better practice: use copy()
my_custom_df.index = custom_index
print("Custom indexing demonstration:")
print(f"Artist at index 'a': {my_custom_df.loc['a', 'Artist']}")
print("\nMultiple artists (a through d):")
artist_range = my_custom_df.loc['a':'d', 'Artist']
print(artist_range.tolist())
a    Michael Jackson
b              AC/DC
c         Pink Floyd
d    Whitney Houston
Name: Artist, dtype: object
Click here for the solution
new_index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
df_new = my_music_df.copy()
df_new.index = new_index
df_new.loc['a', 'Artist']
df_new.loc['a':'d', 'Artist']
My advanced technique: Custom indexing allows for more intuitive data access patterns, especially useful in complex analysis workflows.
My Pandas Data Loading Mastery - Key Takeaways¶
What I've Accomplished:¶
✅ File Loading Expertise:
- CSV files with pd.read_csv() - my preferred format
- Excel files with pd.read_excel() - for business workflows
- Data validation and exploration with head(), shape, info()
✅ Data Access Mastery:
- Series vs DataFrame: Single vs double brackets
- Column Selection: Single and multiple column extraction
- Position-based Access: iloc[row, col] for coordinates
- Label-based Access: loc[row, 'column'] for readability
- Slicing Techniques: Efficient multi-row/column selection
My Data Access Philosophy:¶
🎯 "Precision First" - Always know exactly what data I'm extracting
📋 "Readable Code" - Prefer loc with column names over iloc positions
🔍 "Explore Before Analysis" - Always head() and check shape first
⚡ "Efficient Selection" - Use slicing for multiple elements simultaneously
Real-World Applications:¶
- Music Analysis: Genre trends, release patterns, rating correlations
- Sales Data: Revenue analysis, market performance tracking
- Time Series: Historical trends, chronological patterns
- Data Cleaning: Selective column processing, targeted row filtering
Next Steps in My Data Journey:¶
🚀 Advanced Pandas Operations:
- GroupBy operations for categorical analysis
- Data filtering and boolean indexing
- Merging and joining multiple datasets
- Statistical analysis and aggregations
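As a teaser for the boolean-indexing step above, here's a minimal sketch (hypothetical ratings hard-coded for illustration, not the notebook's loaded data):

```python
import pandas as pd

# Hypothetical miniature of the album table with friend ratings
df = pd.DataFrame(
    {"Artist": ["Michael Jackson", "AC/DC", "Pink Floyd", "Bee Gees"],
     "Rating": [10.0, 9.5, 9.0, 7.0]}
)

# A comparison on a column yields a boolean Series; indexing the DataFrame
# with it keeps only the rows where the condition is True
top = df[df["Rating"] >= 9.0]
print(top["Artist"].tolist())  # ['Michael Jackson', 'AC/DC', 'Pink Floyd']
```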
Mohammad Sayem Chowdhury - Transforming raw data into meaningful insights with Pandas
"Every dataset tells a story - Pandas gives me the tools to read it fluently."