|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +import seaborn as sns |
| 4 | + |
| 5 | +df = pd.read_excel('/Users/oscar/Desktop/My_Python_Libraries/text_exercise.XLSX') |
| 6 | +# print(df) |
| 7 | +# id staff department job salary age |
| 8 | +# 0 M0001 Tom BLUE HR manager "$150,000" 52 |
| 9 | +# 1 M0002 JOHN BLACK IT manager "$180,000" 48 |
| 10 | +# 2 E0001 Micheal Brown IT data scientist "$150,000" 35 |
| 11 | +# 3 E0002 jason walker HR recruiter 130000dolar 38 |
| 12 | +# 4 E0003 Alex Green IT backend developer "$110,000" - |
| 13 | +# 5 E0004 OSCAR SMİTH IT frontend developer "$120,000" 32 |
| 14 | +# 6 E0005 Adrian STAR IT data scientist "$135,000" 40 |
| 15 | +# 7 E0006 Albert simon IT data scientist 125000dolar 35 |
| 16 | + |
| 17 | +# print(df.info()) |
| 18 | +# <class 'pandas.core.frame.DataFrame'> |
| 19 | +# RangeIndex: 8 entries, 0 to 7 |
| 20 | +# Data columns (total 6 columns): |
| 21 | +# # Column Non-Null Count Dtype |
| 22 | +# --- ------ -------------- ----- |
| 23 | +# 0 id 8 non-null object |
| 24 | +# 1 staff 8 non-null object |
| 25 | +# 2 department 8 non-null object |
| 26 | +# 3 job 8 non-null object |
| 27 | +# 4 salary 8 non-null object |
| 28 | +# 5 age 8 non-null object |
| 29 | +# dtypes: object(6) |
| 30 | +# memory usage: 512.0+ bytes |
| 31 | +# None |
| 32 | + |
| 33 | +#------------------------------------------------------------------------------------------------ |
| 34 | +# some explanations : |
| 35 | +# lower() => Converts a string into lower case |
| 36 | +# upper() => Converts a string into upper case |
| 37 | +# capitalize() => Converts the first character to upper case and the rest to lower case |
| 38 | +# title() => Converts the first character of each word to upper case |
| 39 | +# swapcase() => Swaps the case lower/upper |
| 40 | + |
| 41 | +# This is Python's built-in str.lower(): |
| 42 | +# print('steve'.lower()) |
| 43 | +# steve |
| 44 | + |
| 45 | +# this is for series in pandas : |
| 46 | +# print(df['staff'].str.lower()) |
| 47 | +# 0 tom blue |
| 48 | +# 1 john black |
| 49 | +# 2 micheal brown |
| 50 | +# 3 jason walker |
| 51 | +# 4 alex green |
| 52 | +# 5 oscar smi̇th |
| 53 | +# 6 adrian star |
| 54 | +# 7 albert simon |
| 55 | +# Name: staff, dtype: object |
| 56 | + |
| 57 | +# You can use str.upper(), str.title(), str.capitalize() and str.swapcase() in the same way. |
| 58 | + |
| 59 | +#------------------------------------------------------------------------------------------------ |
| 60 | +# isalpha() => Returns True if all characters in the string are in the alphabet |
| 61 | +# isnumeric() => Returns True if all characters in the string are numeric |
| 62 | +# isalnum() => Returns True if all characters in the string are alphanumeric |
| 63 | +# endswith() => Returns true if the string ends with the specified value |
| 64 | +# startswith() => Returns true if the string starts with the specified value |
| 65 | +# contains() => Returns True for each element that contains the given substring (or regex pattern), else False |
| 66 | + |
| 67 | +# print(df) |
| 68 | +# id staff department job salary age |
| 69 | +# 0 M0001 Tom BLUE HR manager "$150,000" 52 |
| 70 | +# 1 M0002 JOHN BLACK IT manager "$180,000" 48 |
| 71 | +# 2 E0001 Micheal Brown IT data scientist "$150,000" 35 |
| 72 | +# 3 E0002 jason walker HR recruiter 130000dolar 38 |
| 73 | +# 4 E0003 Alex Green IT backend developer "$110,000" - |
| 74 | +# 5 E0004 OSCAR SMİTH IT frontend developer "$120,000" 32 |
| 75 | +# 6 E0005 Adrian STAR IT data scientist "$135,000" 40 |
| 76 | +# 7 E0006 Albert simon IT data scientist 125000dolar 35 |
| 77 | + |
| 78 | +# print(df['job'].str.isalpha()) |
| 79 | +# 0 True |
| 80 | +# 1 True |
| 81 | +# 2 False |
| 82 | +# 3 True |
| 83 | +# 4 False |
| 84 | +# 5 False |
| 85 | +# 6 False |
| 86 | +# 7 False |
| 87 | +# Name: job, dtype: bool |
| 88 | + |
| 89 | +# print(df['age'].str.isnumeric()) |
| 90 | +# 0 NaN |
| 91 | +# 1 NaN |
| 92 | +# 2 NaN |
| 93 | +# 3 NaN |
| 94 | +# 4 False |
| 95 | +# 5 NaN |
| 96 | +# 6 NaN |
| 97 | +# 7 NaN |
| 98 | +# Name: age, dtype: object |
| 99 | +# |
| 100 | +# Most rows come back as NaN, but we need a boolean (True/False) for every row. |
| 101 | + |
| 102 | +# The column's dtype is object, but most of its values appear to have been read as numbers, not strings |
| 103 | +# (only '-' is a string), and .str methods return NaN for non-string elements. So convert every value to str first. |
| 104 | +# astype() converts each element to the given type -- str in the example below. |
| 105 | +# print(df['age'].astype(str).str.isnumeric()) |
| 106 | +# 0 True |
| 107 | +# 1 True |
| 108 | +# 2 True |
| 109 | +# 3 True |
| 110 | +# 4 False |
| 111 | +# 5 True |
| 112 | +# 6 True |
| 113 | +# 7 True |
| 114 | +# Name: age, dtype: bool |
| 115 | + |
| 116 | +# print(df['job']) |
| 117 | +# 0 manager |
| 118 | +# 1 manager |
| 119 | +# 2 data scientist |
| 120 | +# 3 recruiter |
| 121 | +# 4 backend developer |
| 122 | +# 5 frontend developer |
| 123 | +# 6 data scientist |
| 124 | +# 7 data scientist |
| 125 | +# Name: job, dtype: object |
| 126 | + |
| 127 | +# print(df['job'].str.startswith('data')) |
| 128 | +# 0 False |
| 129 | +# 1 False |
| 130 | +# 2 True |
| 131 | +# 3 False |
| 132 | +# 4 False |
| 133 | +# 5 False |
| 134 | +# 6 True |
| 135 | +# 7 True |
| 136 | +# Name: job, dtype: bool |
| 137 | + |
| 138 | +# str.endswith("per") and str.contains("data") can be used in the same way. |
| 139 | + |
| 140 | + |
| 141 | +# print(df['salary']) |
| 142 | +# 0 "$150,000" |
| 143 | +# 1 "$180,000" |
| 144 | +# 2 "$150,000" |
| 145 | +# 3 130000dolar |
| 146 | +# 4 "$110,000" |
| 147 | +# 5 "$120,000" |
| 148 | +# 6 "$135,000" |
| 149 | +# 7 125000dolar |
| 150 | +# Name: salary, dtype: object |
| 151 | + |
| 152 | +# print(df['salary'].str.isalnum()) |
| 153 | +# 0 False |
| 154 | +# 1 False |
| 155 | +# 2 False |
| 156 | +# 3 True |
| 157 | +# 4 False |
| 158 | +# 5 False |
| 159 | +# 6 False |
| 160 | +# 7 True |
| 161 | +# Name: salary, dtype: bool |
| 162 | + |
| 163 | +# If the values contain any punctuation or symbols, the result can be misleading, |
| 164 | +# so we need to clean them first. |
| 165 | + |
| 166 | +# Using a regex: |
| 167 | +# The result is True for each value that contains at least one lowercase letter from a to z. |
| 168 | +# print(df['salary'].str.contains(r'[a-z]')) |
| 169 | +# 0 False |
| 170 | +# 1 False |
| 171 | +# 2 False |
| 172 | +# 3 True |
| 173 | +# 4 False |
| 174 | +# 5 False |
| 175 | +# 6 False |
| 176 | +# 7 True |
| 177 | +# Name: salary, dtype: bool |
| 178 | + |
| 179 | +#------------------------------------------------------------------------------------------------ |
| 180 | +# String methods that return booleans can be used as a condition (boolean mask) to select the matching rows. |
| 181 | +# This returns all rows whose 'job' column contains 'data': |
| 182 | +# print(df[ df['job'].str.contains('data') ]) |
| 183 | +# id staff department job salary age |
| 184 | +# 2 E0001 Micheal Brown IT data scientist "$150,000" 35 |
| 185 | +# 6 E0005 Adrian STAR IT data scientist "$135,000" 40 |
| 186 | +# 7 E0006 Albert simon IT data scientist 125000dolar 35 |
| 187 | + |
| 188 | + |
| 189 | + |
| 190 | + |
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | + |
| 195 | + |
| 196 | + |
| 197 | + |
| 198 | + |
| 199 | + |
| 200 | + |
| 201 | + |
| 202 | + |
| 203 | + |
| 204 | + |
| 205 | + |
| 206 | + |
0 commit comments