2023Python程序设计--期末大作业

2023Python程序设计–期末大作业

任务一

抓取链家官网北上广深4个一线城市和天津的租房数据。应获取每个城市的全部租房数据(一线城市的数据量应该在万的数量级)。

分析

以北京为例,撰写报告时北京的房源总数为66187套,而链家在每个筛选条件下最多只允许查看前100页(共3000套)租房信息。因此,我们需要细化筛选条件,使每个条件下的房源都少于100页。解决方法是按照市辖区分别筛选,这样每个条件下的结果均小于100页。

因此,可以分两步进行:

  1. 将所有市辖区爬取下来

  2. 按照市辖区组合爬取住房信息

代码

以北京为例,其余地区与北京相似:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import scrapy

class RoomItem(scrapy.Item):
    """Container for the fields scraped from a single rental listing."""

    # Listing title.
    名字 = scrapy.Field()
    # Municipal district the listing belongs to.
    市辖区 = scrapy.Field()
    # Second-level address within the district.
    二级地址 = scrapy.Field()
    # Residential community (compound) name.
    小区 = scrapy.Field()
    # Monthly rent.
    月租金 = scrapy.Field()
    # Layout / room type.
    类型 = scrapy.Field()
    # Floor area.
    面积 = scrapy.Field()

class DistItem(scrapy.Item):
    """Container for one scraped district link: its name and its URL."""

    # District name.
    市辖区名字 = scrapy.Field()
    # District URL.
    市辖区网址 = scrapy.Field()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import scrapy
from scrapy import Request
from spider.items import DistItem

class MySpider(scrapy.Spider):
    """Collect the filter links shown on each Beijing district rental page.

    One request is issued per slug in ``district``; every link found at the
    hard-coded XPath in ``parse`` is emitted as a ``DistItem``.
    """

    name = "bjdistrict"
    allowed_domains = ["bj.lianjia.com"]
    start_urls = ["https://bj.lianjia.com/zufang/"]
    # Request headers sent with every request issued by start_requests().
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "}
    # URL slugs of the districts to visit.
    district=['dongcheng','xicheng','chaoyang','haidian','fengtai','shijingshan','tongzhou','changping','daxing','yizhuangkaifaqu','shunyi','fangshan','mentougou','pinggu','huairou','miyun','yanqing']

    def start_requests(self):
        """Yield one request per district slug, tagging it with the slug in ``meta``."""
        for dis in self.district:
            url = f'https://bj.lianjia.com/zufang/{dis}/'
            yield Request(url=url, callback=self.parse, meta={'district': dis}, headers=self.head)

    def parse(self, response):
        """Yield a ``DistItem`` for every usable link in the page's filter list.

        The '不限' ("no limit") entry and links without an href are skipped.
        """
        # NOTE(review): this absolute XPath depends on the exact page layout
        # and will silently match nothing if the layout changes.
        for each in response.xpath("/html/body/div[3]/div[1]/div[4]/div[1]/ul[4]/li/a"):
            item = DistItem()
            item['市辖区名字'] = ''.join(each.xpath("text()").extract()).strip()
            item['市辖区网址'] = ''.join(each.xpath("@href").extract()).strip()
            if item['市辖区名字'] != '不限' and item['市辖区网址']:
                yield item
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from itemadapter import ItemAdapter
import json

class SpiderPipeline:
    """Write every scraped item to ``<spider name>.json`` as one JSON array.

    The array is streamed: ``open_spider`` writes the opening bracket,
    ``process_item`` appends one JSON object per item, and ``close_spider``
    writes the closing bracket.
    """

    # Spiders whose scraped items may repeat and therefore get de-duplicated.
    DEDUP_SPIDERS = frozenset(
        {'bjdistrict', 'gzdistrict', 'shdistrict', 'szdistrict', 'tjdistrict'}
    )

    def open_spider(self, spider):
        """Open the output file for ``spider`` and start the JSON array."""
        # The file name is derived from the spider name, so a spider that is
        # not explicitly listed no longer leaves ``self.file`` unset.
        self.file = open(f'{spider.name}.json', 'w', encoding='utf-8')
        self.file.write('[\n')
        self.first_item = True
        self.processed_items = set()

    def close_spider(self, spider):
        """Terminate the JSON array and close the output file."""
        try:
            self.file.write('\n]')
        finally:
            # Close the handle even if the final write fails.
            self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file and return it.

        Raises:
            scrapy.exceptions.DropItem: for a district spider, when an
                identical item has already been written.
        """
        item_dict = dict(item)
        # Scraped district links may repeat, so they are de-duplicated.
        if spider.name in self.DEDUP_SPIDERS:
            item_tuple = tuple(item_dict.items())
            if item_tuple in self.processed_items:
                # Imported here so the module only needs scrapy when a
                # duplicate is actually dropped.
                from scrapy.exceptions import DropItem
                raise DropItem(f'duplicate item: {item_dict!r}')
            self.processed_items.add(item_tuple)

        # Every item except the first is preceded by a separator.
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(",\n")

        self.file.write(json.dumps(item_dict, ensure_ascii=False))
        return item

任务二

比较5个城市的总体房租情况,包含租金的均价、最高价、最低价、中位数等信息,单位面积租金(元/平米)的均价、最高价、最低价、中位数等信息。采用合适的图或表形式进行展示。

分析

获取到所有爬取数据后,简单画图发现其中存在一些极大值(10万元以上)和极小值(100元以下),这些数据很明显会对图产生较大影响,因此需要去除与均值相差超过两倍标准差的数据

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Beijing: mean / max / min / median of the monthly rent and of the rent per
# unit area (yuan per square metre).
# A context manager closes the file handle as soon as the data is loaded.
with open('beijing.json', 'r', encoding='utf-8') as _fh:
    beijing_array = json.load(_fh)
# Keep only listings whose area field is present and carries the '㎡' unit.
beijing_array = [entry for entry in beijing_array if entry.get('面积') and '㎡' in entry['面积']]
# Parse every rent once; for a range such as "a-b" the lower bound is used.
rent_prices = np.array([float(entry['月租金'].split('-')[0]) for entry in beijing_array])
# Remove outliers on BOTH sides: keep listings whose rent lies within two
# standard deviations of the mean. (A one-sided `rent - mean < 2 * sigma`
# test can never remove abnormally low rents.)
sigma = np.std(rent_prices)
mean = np.mean(rent_prices)
_keep = np.abs(rent_prices - mean) < 2 * sigma
beijing_array = [entry for entry, _ok in zip(beijing_array, _keep) if _ok]
# Rents of the remaining Beijing listings.
rent_prices = rent_prices[_keep]

# Monthly rent: max, min, median, mean.
beijing_max_rent_price = rent_prices.max()
beijing_min_rent_price = rent_prices.min()
beijing_median_rent_price = np.median(rent_prices)
beijing_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("北京市最高租金:", beijing_max_rent_price)
print("北京市最低租金:", beijing_min_rent_price)
print("北京市租金中位数:", beijing_median_rent_price)
print("北京市租金均价:", beijing_mean_rent_price)

# Areas of the remaining listings (the number in front of '㎡').
rent_areas = np.array([float(entry['面积'].split('㎡')[0]) for entry in beijing_array])
# Rent per unit area: max, min, median, mean.
rent_per_area = rent_prices / rent_areas
beijing_max_rent_per_area = rent_per_area.max()
beijing_min_rent_per_area = rent_per_area.min()
beijing_median_rent_per_area = np.median(rent_per_area)
beijing_mean_rent_per_area = np.mean(rent_per_area)

# Print the results.
print("北京市最高单位面积租金:", beijing_max_rent_per_area)
print("北京市最低单位面积租金:", beijing_min_rent_per_area)
print("北京市单位面积租金中位数:", beijing_median_rent_per_area)
print("北京市单位面积租金均价:", beijing_mean_rent_per_area)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Guangzhou: mean / max / min / median of the monthly rent and of the rent per
# unit area (yuan per square metre).
# A context manager closes the file handle as soon as the data is loaded.
with open('guangzhou.json', 'r', encoding='utf-8') as _fh:
    guangzhou_array = json.load(_fh)
# Keep only listings whose area field is present and carries the '㎡' unit.
guangzhou_array = [entry for entry in guangzhou_array if entry.get('面积') and '㎡' in entry['面积']]
# Parse every rent once; for a range such as "a-b" the lower bound is used.
rent_prices = np.array([float(entry['月租金'].split('-')[0]) for entry in guangzhou_array])
# Remove outliers on BOTH sides: keep listings whose rent lies within two
# standard deviations of the mean. (A one-sided `rent - mean < 2 * sigma`
# test can never remove abnormally low rents.)
sigma = np.std(rent_prices)
mean = np.mean(rent_prices)
_keep = np.abs(rent_prices - mean) < 2 * sigma
guangzhou_array = [entry for entry, _ok in zip(guangzhou_array, _keep) if _ok]
# Rents of the remaining Guangzhou listings.
rent_prices = rent_prices[_keep]

# Monthly rent: max, min, median, mean.
guangzhou_max_rent_price = rent_prices.max()
guangzhou_min_rent_price = rent_prices.min()
guangzhou_median_rent_price = np.median(rent_prices)
guangzhou_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("广州市最高租金:", guangzhou_max_rent_price)
print("广州市最低租金:", guangzhou_min_rent_price)
print("广州市租金中位数:", guangzhou_median_rent_price)
print("广州市租金均价:", guangzhou_mean_rent_price)

# Areas of the remaining listings (the number in front of '㎡').
rent_areas = np.array([float(entry['面积'].split('㎡')[0]) for entry in guangzhou_array])
# Rent per unit area: max, min, median, mean.
rent_per_area = rent_prices / rent_areas
guangzhou_max_rent_per_area = rent_per_area.max()
guangzhou_min_rent_per_area = rent_per_area.min()
guangzhou_median_rent_per_area = np.median(rent_per_area)
guangzhou_mean_rent_per_area = np.mean(rent_per_area)

# Print the results.
print("广州市最高单位面积租金:", guangzhou_max_rent_per_area)
print("广州市最低单位面积租金:", guangzhou_min_rent_per_area)
print("广州市单位面积租金中位数:", guangzhou_median_rent_per_area)
print("广州市单位面积租金均价:", guangzhou_mean_rent_per_area)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Shenzhen: mean / max / min / median of the monthly rent and of the rent per
# unit area (yuan per square metre).
# A context manager closes the file handle as soon as the data is loaded.
with open('shenzhen.json', 'r', encoding='utf-8') as _fh:
    shenzhen_array = json.load(_fh)
# Keep only listings whose area field is present and carries the '㎡' unit.
shenzhen_array = [entry for entry in shenzhen_array if entry.get('面积') and '㎡' in entry['面积']]
# Parse every rent once; for a range such as "a-b" the lower bound is used.
rent_prices = np.array([float(entry['月租金'].split('-')[0]) for entry in shenzhen_array])
# Remove outliers on BOTH sides: keep listings whose rent lies within two
# standard deviations of the mean. (A one-sided `rent - mean < 2 * sigma`
# test can never remove abnormally low rents.)
sigma = np.std(rent_prices)
mean = np.mean(rent_prices)
_keep = np.abs(rent_prices - mean) < 2 * sigma
shenzhen_array = [entry for entry, _ok in zip(shenzhen_array, _keep) if _ok]
# Rents of the remaining Shenzhen listings.
rent_prices = rent_prices[_keep]

# Monthly rent: max, min, median, mean.
shenzhen_max_rent_price = rent_prices.max()
shenzhen_min_rent_price = rent_prices.min()
shenzhen_median_rent_price = np.median(rent_prices)
shenzhen_mean_rent_price = np.mean(rent_prices)
# Misspelled name kept as an alias because the comparison chart refers to it.
shenzhen_mean_reng_price = shenzhen_mean_rent_price

# Print the results.
print("深圳市最高租金:", shenzhen_max_rent_price)
print("深圳市最低租金:", shenzhen_min_rent_price)
print("深圳市租金中位数:", shenzhen_median_rent_price)
print("深圳市租金均价:", shenzhen_mean_rent_price)

# Areas of the remaining listings (the number in front of '㎡').
rent_areas = np.array([float(entry['面积'].split('㎡')[0]) for entry in shenzhen_array])
# Rent per unit area: max, min, median, mean.
rent_per_area = rent_prices / rent_areas
shenzhen_max_rent_per_area = rent_per_area.max()
shenzhen_min_rent_per_area = rent_per_area.min()
shenzhen_median_rent_per_area = np.median(rent_per_area)
shenzhen_mean_rent_per_area = np.mean(rent_per_area)

# Print the results.
print("深圳市最高单位面积租金:", shenzhen_max_rent_per_area)
print("深圳市最低单位面积租金:", shenzhen_min_rent_per_area)
print("深圳市单位面积租金中位数:", shenzhen_median_rent_per_area)
print("深圳市单位面积租金均价:", shenzhen_mean_rent_per_area)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Shanghai: mean / max / min / median of the monthly rent and of the rent per
# unit area (yuan per square metre).
# A context manager closes the file handle as soon as the data is loaded.
with open('shanghai.json', 'r', encoding='utf-8') as _fh:
    shanghai_array = json.load(_fh)
# Keep only listings whose area field is present and carries the '㎡' unit.
shanghai_array = [entry for entry in shanghai_array if entry.get('面积') and '㎡' in entry['面积']]
# Parse every rent once; for a range such as "a-b" the lower bound is used.
rent_prices = np.array([float(entry['月租金'].split('-')[0]) for entry in shanghai_array])
# Remove outliers on BOTH sides: keep listings whose rent lies within two
# standard deviations of the mean. (A one-sided `rent - mean < 2 * sigma`
# test can never remove abnormally low rents.)
sigma = np.std(rent_prices)
mean = np.mean(rent_prices)
_keep = np.abs(rent_prices - mean) < 2 * sigma
shanghai_array = [entry for entry, _ok in zip(shanghai_array, _keep) if _ok]
# Rents of the remaining Shanghai listings.
rent_prices = rent_prices[_keep]

# Monthly rent: max, min, median, mean.
shanghai_max_rent_price = rent_prices.max()
shanghai_min_rent_price = rent_prices.min()
shanghai_median_rent_price = np.median(rent_prices)
shanghai_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("上海市最高租金:", shanghai_max_rent_price)
print("上海市最低租金:", shanghai_min_rent_price)
print("上海市租金中位数:", shanghai_median_rent_price)
print("上海市租金均价:", shanghai_mean_rent_price)

# Areas of the remaining listings (the number in front of '㎡').
rent_areas = np.array([float(entry['面积'].split('㎡')[0]) for entry in shanghai_array])

# Rent per unit area: max, min, median, mean.
rent_per_area = rent_prices / rent_areas
shanghai_max_rent_per_area = rent_per_area.max()
shanghai_min_rent_per_area = rent_per_area.min()
shanghai_median_rent_per_area = np.median(rent_per_area)
shanghai_mean_rent_per_area = np.mean(rent_per_area)
# Misspelled name kept as an alias because the comparison chart refers to it.
shanghai_mean_rent_per_araa = shanghai_mean_rent_per_area

# Print the results.
print("上海市最高单位面积租金:", shanghai_max_rent_per_area)
print("上海市最低单位面积租金:", shanghai_min_rent_per_area)
print("上海市单位面积租金中位数:", shanghai_median_rent_per_area)
print("上海市单位面积租金均价:", shanghai_mean_rent_per_area)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Tianjin: mean / max / min / median of the monthly rent and of the rent per
# unit area (yuan per square metre).
# A context manager closes the file handle as soon as the data is loaded.
with open('tianjin.json', 'r', encoding='utf-8') as _fh:
    tianjin_array = json.load(_fh)
# Keep only listings whose area field is present and carries the '㎡' unit.
tianjin_array = [entry for entry in tianjin_array if entry.get('面积') and '㎡' in entry['面积']]
# Parse every rent once; for a range such as "a-b" the lower bound is used.
rent_prices = np.array([float(entry['月租金'].split('-')[0]) for entry in tianjin_array])
# Remove outliers on BOTH sides: keep listings whose rent lies within two
# standard deviations of the mean. (A one-sided `rent - mean < 2 * sigma`
# test can never remove abnormally low rents.)
sigma = np.std(rent_prices)
mean = np.mean(rent_prices)
_keep = np.abs(rent_prices - mean) < 2 * sigma
tianjin_array = [entry for entry, _ok in zip(tianjin_array, _keep) if _ok]
# Rents of the remaining Tianjin listings.
rent_prices = rent_prices[_keep]

# Monthly rent: max, min, median, mean.
tianjin_max_rent_price = rent_prices.max()
tianjin_min_rent_price = rent_prices.min()
tianjin_median_rent_price = np.median(rent_prices)
tianjin_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("天津市最高租金:", tianjin_max_rent_price)
print("天津市最低租金:", tianjin_min_rent_price)
print("天津市租金中位数:", tianjin_median_rent_price)
print("天津市租金均价:", tianjin_mean_rent_price)

# Areas of the remaining listings (the number in front of '㎡').
rent_areas = np.array([float(entry['面积'].split('㎡')[0]) for entry in tianjin_array])
# Rent per unit area: max, min, median, mean.
rent_per_area = rent_prices / rent_areas
tianjin_max_rent_per_area = rent_per_area.max()
tianjin_min_rent_per_area = rent_per_area.min()
tianjin_median_rent_per_area = np.median(rent_per_area)
tianjin_mean_rent_per_area = np.mean(rent_per_area)

# Print the results.
print("天津市最高单位面积租金:", tianjin_max_rent_per_area)
print("天津市最低单位面积租金:", tianjin_min_rent_per_area)
print("天津市单位面积租金中位数:", tianjin_median_rent_per_area)
print("天津市单位面积租金均价:", tianjin_mean_rent_per_area)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# Per-city monthly-rent summary; every list follows the city order in '城市'.
data = {
    '城市': ['北京', '广州', '深圳', '天津','上海'],
    '租金最高价': [beijing_max_rent_price, guangzhou_max_rent_price, shenzhen_max_rent_price, tianjin_max_rent_price, shanghai_max_rent_price],
    '租金最低价': [beijing_min_rent_price, guangzhou_min_rent_price, shenzhen_min_rent_price, tianjin_min_rent_price, shanghai_min_rent_price],
    '租金中位数': [beijing_median_rent_price, guangzhou_median_rent_price, shenzhen_median_rent_price, tianjin_median_rent_price, shanghai_median_rent_price],
    '租金平均数': [beijing_mean_rent_price, guangzhou_mean_rent_price, shenzhen_mean_reng_price, tianjin_mean_rent_price, shanghai_mean_rent_price],
}

# Tabulate the summary.
df = pd.DataFrame(data)

# Seaborn theme; the font is set so that the Chinese labels render.
sns.set(style="whitegrid", font='Microsoft YaHei')

# A 2x2 grid: one bar chart per statistic.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (column, bar colour, title) for each panel, in row-major grid order.
_panels = [
    ('租金最高价', 'skyblue', '租金最高价对比图'),
    ('租金最低价', 'lightcoral', '租金最低价对比图'),
    ('租金中位数', 'lightgreen', '租金中位数对比图'),
    ('租金平均数', 'lightyellow', '租金平均数对比图'),
]
for _axis, (_column, _color, _title) in zip(axes.flat, _panels):
    sns.barplot(x='城市', y=_column, data=df, color=_color, ax=_axis)
    _axis.set_title(_title)
    _axis.set_ylabel('租金(元)')
    # Write each bar's value just above the bar.
    for p in _axis.patches:
        _axis.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                       ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Tidy the spacing between panels, then display the figure.
plt.tight_layout()
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Per-city rent-per-unit-area summary; every list follows the city order in '城市'.
data = {
    '城市': ['北京', '广州', '深圳', '天津','上海'],
    '单位租金最高价': [beijing_max_rent_per_area, guangzhou_max_rent_per_area, shenzhen_max_rent_per_area, tianjin_max_rent_per_area, shanghai_max_rent_per_area],
    '单位租金最低价': [beijing_min_rent_per_area, guangzhou_min_rent_per_area, shenzhen_min_rent_per_area, tianjin_min_rent_per_area, shanghai_min_rent_per_area],
    '单位租金中位数': [beijing_median_rent_per_area, guangzhou_median_rent_per_area, shenzhen_median_rent_per_area, tianjin_median_rent_per_area, shanghai_median_rent_per_area],
    '单位租金平均数': [beijing_mean_rent_per_area, guangzhou_mean_rent_per_area, shenzhen_mean_rent_per_area, tianjin_mean_rent_per_area, shanghai_mean_rent_per_araa],
}

# Tabulate the summary.
df = pd.DataFrame(data)

# Seaborn theme; the font is set so that the Chinese labels render.
sns.set(style="whitegrid", font='Microsoft YaHei')

# A 2x2 grid: one bar chart per statistic.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))

# (column, bar colour, title) for each panel, in row-major grid order.
_panels = [
    ('单位租金最高价', 'skyblue', '单位租金最高价对比图'),
    ('单位租金最低价', 'lightcoral', '单位租金最低价对比图'),
    ('单位租金中位数', 'lightgreen', '单位租金中位数对比图'),
    ('单位租金平均数', 'lightyellow', '单位租金平均数对比图'),
]
for _axis, (_column, _color, _title) in zip(axes.flat, _panels):
    ax = sns.barplot(x='城市', y=_column, data=df, color=_color, ax=_axis)
    ax.set_title(_title)
    ax.set_ylabel('租金(元/平方米)')

    # Write each bar's value just above the bar.
    for p in ax.patches:
        ax.annotate(f'{p.get_height():.2f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Tidy the spacing between panels, then display the figure.
plt.tight_layout()
plt.show()

任务三

比较5个城市一居、二居、三居的情况,包含均价、最高价、最低价、中位数等信息,采用合适的图或表形式进行展示。

分析

由于最高价和最低价相差悬殊,直接绘图时较小的数值难以分辨,因此纵轴采用价格的自然对数,数据点上仍标注原始价格

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Beijing: max / min / median / mean monthly rent of one-, two- and
# three-bedroom listings, drawn as a line chart on a log scale.


def _rents_by_room(entries, room_tag):
    """Return the listings whose '类型' contains room_tag, and their rents as an array."""
    matched = [entry for entry in entries if entry.get('类型') and room_tag in entry['类型']]
    rents = np.array([float(entry['月租金'].split('-')[0]) for entry in matched])
    return matched, rents


# One-bedroom listings.
beijing_one_room_array, rent_prices = _rents_by_room(beijing_array, '1室')
beijing_one_room_max_rent_price = rent_prices.max()
beijing_one_room_min_rent_price = rent_prices.min()
beijing_one_room_median_rent_price = np.median(rent_prices)
beijing_one_room_mean_rent_price = np.mean(rent_prices)

# Two-bedroom listings.
beijing_two_room_array, rent_prices = _rents_by_room(beijing_array, '2室')
beijing_two_room_max_rent_price = rent_prices.max()
beijing_two_room_min_rent_price = rent_prices.min()
beijing_two_room_median_rent_price = np.median(rent_prices)
beijing_two_room_mean_rent_price = np.mean(rent_prices)

# Three-bedroom listings.
beijing_three_room_array, rent_prices = _rents_by_room(beijing_array, '3室')
beijing_three_room_max_rent_price = rent_prices.max()
beijing_three_room_min_rent_price = rent_prices.min()
beijing_three_room_median_rent_price = np.median(rent_prices)
beijing_three_room_mean_rent_price = np.mean(rent_prices)

# One record per room type.
data_dict = [
    {'户型': '一居', '最高价': beijing_one_room_max_rent_price, '最低价': beijing_one_room_min_rent_price, '中位数': beijing_one_room_median_rent_price, '均价': beijing_one_room_mean_rent_price},
    {'户型': '二居', '最高价': beijing_two_room_max_rent_price, '最低价': beijing_two_room_min_rent_price, '中位数': beijing_two_room_median_rent_price, '均价': beijing_two_room_mean_rent_price},
    {'户型': '三居', '最高价': beijing_three_room_max_rent_price, '最低价': beijing_three_room_min_rent_price, '中位数': beijing_three_room_median_rent_price, '均价': beijing_three_room_mean_rent_price},
]

# Tabulate, then reshape to long form (one row per room type and statistic)
# as expected by seaborn's lineplot.
df = pd.DataFrame(data_dict)
df_melted = pd.melt(df, id_vars=['户型'], var_name='租金类型', value_name='价格')

# Natural log of the price, used for the y axis.
df_melted['对数价格'] = np.log(df_melted['价格'])

# One line per statistic across the three room types.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(x="户型", y="对数价格", hue="租金类型", data=df_melted, marker='o')

# Label every point with its original (non-log) price. The melted frame holds
# three consecutive rows per statistic, so index % 3 is the x position; rows
# 6-8 (the median series) get their label below the point, the rest above.
for index, row in df_melted.iterrows():
    _dy = -10 if index in (6, 7, 8) else 5
    ax.annotate(f"{np.exp(row['对数价格']):.0f}", (index%3, row['对数价格']), textcoords="offset points", xytext=(0, _dy), ha='center')

plt.title('不同户型的租金(对数)及原始价格对比折线图')
plt.ylabel('对数价格')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Guangzhou: max / min / median / mean monthly rent of one-, two- and
# three-bedroom listings, printed and drawn as a line chart on a log scale.


def _rents_by_room(entries, room_tag):
    """Return the listings whose '类型' contains room_tag, and their rents as an array."""
    matched = [entry for entry in entries if entry.get('类型') and room_tag in entry['类型']]
    rents = np.array([float(entry['月租金'].split('-')[0]) for entry in matched])
    return matched, rents


# One-bedroom listings.
guangzhou_one_room_array, rent_prices = _rents_by_room(guangzhou_array, '1室')
guangzhou_one_room_max_rent_price = rent_prices.max()
guangzhou_one_room_min_rent_price = rent_prices.min()
guangzhou_one_room_median_rent_price = np.median(rent_prices)
guangzhou_one_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("广州市一居最高租金:", guangzhou_one_room_max_rent_price)
print("广州市一居最低租金:", guangzhou_one_room_min_rent_price)
print("广州市一居租金中位数:", guangzhou_one_room_median_rent_price)
print("广州市一居租金均价:", guangzhou_one_room_mean_rent_price)

# Two-bedroom listings.
guangzhou_two_room_array, rent_prices = _rents_by_room(guangzhou_array, '2室')
guangzhou_two_room_max_rent_price = rent_prices.max()
guangzhou_two_room_min_rent_price = rent_prices.min()
guangzhou_two_room_median_rent_price = np.median(rent_prices)
guangzhou_two_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("广州市二居最高租金:", guangzhou_two_room_max_rent_price)
print("广州市二居最低租金:", guangzhou_two_room_min_rent_price)
print("广州市二居租金中位数:", guangzhou_two_room_median_rent_price)
print("广州市二居租金均价:", guangzhou_two_room_mean_rent_price)

# Three-bedroom listings.
guangzhou_three_room_array, rent_prices = _rents_by_room(guangzhou_array, '3室')
guangzhou_three_room_max_rent_price = rent_prices.max()
guangzhou_three_room_min_rent_price = rent_prices.min()
guangzhou_three_room_median_rent_price = np.median(rent_prices)
guangzhou_three_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("广州市三居最高租金:", guangzhou_three_room_max_rent_price)
print("广州市三居最低租金:", guangzhou_three_room_min_rent_price)
print("广州市三居租金中位数:", guangzhou_three_room_median_rent_price)
print("广州市三居租金均价:", guangzhou_three_room_mean_rent_price)

# One record per room type.
data_dict = [
    {'户型': '一居', '最高价': guangzhou_one_room_max_rent_price, '最低价': guangzhou_one_room_min_rent_price, '中位数': guangzhou_one_room_median_rent_price, '均价': guangzhou_one_room_mean_rent_price},
    {'户型': '二居', '最高价': guangzhou_two_room_max_rent_price, '最低价': guangzhou_two_room_min_rent_price, '中位数': guangzhou_two_room_median_rent_price, '均价': guangzhou_two_room_mean_rent_price},
    {'户型': '三居', '最高价': guangzhou_three_room_max_rent_price, '最低价': guangzhou_three_room_min_rent_price, '中位数': guangzhou_three_room_median_rent_price, '均价': guangzhou_three_room_mean_rent_price},
]

# Tabulate, then reshape to long form (one row per room type and statistic)
# as expected by seaborn's lineplot.
df = pd.DataFrame(data_dict)
df_melted = pd.melt(df, id_vars=['户型'], var_name='租金类型', value_name='价格')

# Natural log of the price, used for the y axis.
df_melted['对数价格'] = np.log(df_melted['价格'])

# One line per statistic across the three room types.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(x="户型", y="对数价格", hue="租金类型", data=df_melted, marker='o')

# Label every point with its original (non-log) price. The melted frame holds
# three consecutive rows per statistic, so index % 3 is the x position; rows
# 6-8 (the median series) get their label below the point, the rest above.
for index, row in df_melted.iterrows():
    _dy = -10 if index in (6, 7, 8) else 5
    ax.annotate(f"{np.exp(row['对数价格']):.0f}", (index%3, row['对数价格']), textcoords="offset points", xytext=(0, _dy), ha='center')

plt.title('不同户型的租金(对数)及原始价格对比折线图')
plt.ylabel('对数价格')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Shanghai: max / min / median / mean monthly rent of one-, two- and
# three-bedroom listings, printed and drawn as a line chart on a log scale.


def _rents_by_room(entries, room_tag):
    """Return the listings whose '类型' contains room_tag, and their rents as an array."""
    matched = [entry for entry in entries if entry.get('类型') and room_tag in entry['类型']]
    rents = np.array([float(entry['月租金'].split('-')[0]) for entry in matched])
    return matched, rents


# One-bedroom listings.
shanghai_one_room_array, rent_prices = _rents_by_room(shanghai_array, '1室')
shanghai_one_room_max_rent_price = rent_prices.max()
shanghai_one_room_min_rent_price = rent_prices.min()
shanghai_one_room_median_rent_price = np.median(rent_prices)
shanghai_one_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("上海市一居最高租金:", shanghai_one_room_max_rent_price)
print("上海市一居最低租金:", shanghai_one_room_min_rent_price)
print("上海市一居租金中位数:", shanghai_one_room_median_rent_price)
print("上海市一居租金均价:", shanghai_one_room_mean_rent_price)

# Two-bedroom listings.
shanghai_two_room_array, rent_prices = _rents_by_room(shanghai_array, '2室')
shanghai_two_room_max_rent_price = rent_prices.max()
shanghai_two_room_min_rent_price = rent_prices.min()
shanghai_two_room_median_rent_price = np.median(rent_prices)
shanghai_two_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("上海市二居最高租金:", shanghai_two_room_max_rent_price)
print("上海市二居最低租金:", shanghai_two_room_min_rent_price)
print("上海市二居租金中位数:", shanghai_two_room_median_rent_price)
print("上海市二居租金均价:", shanghai_two_room_mean_rent_price)

# Three-bedroom listings.
shanghai_three_room_array, rent_prices = _rents_by_room(shanghai_array, '3室')
shanghai_three_room_max_rent_price = rent_prices.max()
shanghai_three_room_min_rent_price = rent_prices.min()
shanghai_three_room_median_rent_price = np.median(rent_prices)
shanghai_three_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("上海市三居最高租金:", shanghai_three_room_max_rent_price)
print("上海市三居最低租金:", shanghai_three_room_min_rent_price)
print("上海市三居租金中位数:", shanghai_three_room_median_rent_price)
print("上海市三居租金均价:", shanghai_three_room_mean_rent_price)

# One record per room type.
data_dict = [
    {'户型': '一居', '最高价': shanghai_one_room_max_rent_price, '最低价': shanghai_one_room_min_rent_price, '中位数': shanghai_one_room_median_rent_price, '均价': shanghai_one_room_mean_rent_price},
    {'户型': '二居', '最高价': shanghai_two_room_max_rent_price, '最低价': shanghai_two_room_min_rent_price, '中位数': shanghai_two_room_median_rent_price, '均价': shanghai_two_room_mean_rent_price},
    {'户型': '三居', '最高价': shanghai_three_room_max_rent_price, '最低价': shanghai_three_room_min_rent_price, '中位数': shanghai_three_room_median_rent_price, '均价': shanghai_three_room_mean_rent_price},
]

# Tabulate, then reshape to long form (one row per room type and statistic)
# as expected by seaborn's lineplot.
df = pd.DataFrame(data_dict)
df_melted = pd.melt(df, id_vars=['户型'], var_name='租金类型', value_name='价格')

# Natural log of the price, used for the y axis.
df_melted['对数价格'] = np.log(df_melted['价格'])

# One line per statistic across the three room types.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(x="户型", y="对数价格", hue="租金类型", data=df_melted, marker='o')

# Label every point with its original (non-log) price. The melted frame holds
# three consecutive rows per statistic, so index % 3 is the x position; rows
# 6-8 (the median series) get their label below the point, the rest above.
for index, row in df_melted.iterrows():
    _dy = -10 if index in (6, 7, 8) else 5
    ax.annotate(f"{np.exp(row['对数价格']):.0f}", (index%3, row['对数价格']), textcoords="offset points", xytext=(0, _dy), ha='center')

plt.title('不同户型的租金(对数)及原始价格对比折线图')
plt.ylabel('对数价格')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Shenzhen: max / min / median / mean monthly rent of one-, two- and
# three-bedroom listings, printed and drawn as a line chart on a log scale.


def _rents_by_room(entries, room_tag):
    """Return the listings whose '类型' contains room_tag, and their rents as an array."""
    matched = [entry for entry in entries if entry.get('类型') and room_tag in entry['类型']]
    rents = np.array([float(entry['月租金'].split('-')[0]) for entry in matched])
    return matched, rents


# One-bedroom listings.
shenzhen_one_room_array, rent_prices = _rents_by_room(shenzhen_array, '1室')
shenzhen_one_room_max_rent_price = rent_prices.max()
shenzhen_one_room_min_rent_price = rent_prices.min()
shenzhen_one_room_median_rent_price = np.median(rent_prices)
shenzhen_one_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("深圳市一居最高租金:", shenzhen_one_room_max_rent_price)
print("深圳市一居最低租金:", shenzhen_one_room_min_rent_price)
print("深圳市一居租金中位数:", shenzhen_one_room_median_rent_price)
print("深圳市一居租金均价:", shenzhen_one_room_mean_rent_price)

# Two-bedroom listings.
shenzhen_two_room_array, rent_prices = _rents_by_room(shenzhen_array, '2室')
shenzhen_two_room_max_rent_price = rent_prices.max()
shenzhen_two_room_min_rent_price = rent_prices.min()
shenzhen_two_room_median_rent_price = np.median(rent_prices)
shenzhen_two_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("深圳市二居最高租金:", shenzhen_two_room_max_rent_price)
print("深圳市二居最低租金:", shenzhen_two_room_min_rent_price)
print("深圳市二居租金中位数:", shenzhen_two_room_median_rent_price)
print("深圳市二居租金均价:", shenzhen_two_room_mean_rent_price)

# Three-bedroom listings.
shenzhen_three_room_array, rent_prices = _rents_by_room(shenzhen_array, '3室')
shenzhen_three_room_max_rent_price = rent_prices.max()
shenzhen_three_room_min_rent_price = rent_prices.min()
shenzhen_three_room_median_rent_price = np.median(rent_prices)
shenzhen_three_room_mean_rent_price = np.mean(rent_prices)

# Print the results.
print("深圳市三居最高租金:", shenzhen_three_room_max_rent_price)
print("深圳市三居最低租金:", shenzhen_three_room_min_rent_price)
print("深圳市三居租金中位数:", shenzhen_three_room_median_rent_price)
print("深圳市三居租金均价:", shenzhen_three_room_mean_rent_price)

# One record per room type.
data_dict = [
    {'户型': '一居', '最高价': shenzhen_one_room_max_rent_price, '最低价': shenzhen_one_room_min_rent_price, '中位数': shenzhen_one_room_median_rent_price, '均价': shenzhen_one_room_mean_rent_price},
    {'户型': '二居', '最高价': shenzhen_two_room_max_rent_price, '最低价': shenzhen_two_room_min_rent_price, '中位数': shenzhen_two_room_median_rent_price, '均价': shenzhen_two_room_mean_rent_price},
    {'户型': '三居', '最高价': shenzhen_three_room_max_rent_price, '最低价': shenzhen_three_room_min_rent_price, '中位数': shenzhen_three_room_median_rent_price, '均价': shenzhen_three_room_mean_rent_price},
]

# Tabulate, then reshape to long form (one row per room type and statistic)
# as expected by seaborn's lineplot.
df = pd.DataFrame(data_dict)
df_melted = pd.melt(df, id_vars=['户型'], var_name='租金类型', value_name='价格')

# Natural log of the price, used for the y axis.
df_melted['对数价格'] = np.log(df_melted['价格'])

# One line per statistic across the three room types.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(x="户型", y="对数价格", hue="租金类型", data=df_melted, marker='o')

# Label every point with its original (non-log) price. The melted frame holds
# three consecutive rows per statistic, so index % 3 is the x position; rows
# 6-8 (the median series) get their label below the point, the rest above.
for index, row in df_melted.iterrows():
    _dy = -10 if index in (6, 7, 8) else 5
    ax.annotate(f"{np.exp(row['对数价格']):.0f}", (index%3, row['对数价格']), textcoords="offset points", xytext=(0, _dy), ha='center')

plt.title('不同户型的租金(对数)及原始价格对比折线图')
plt.ylabel('对数价格')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Maximum, minimum, median and mean monthly rent of Tianjin one-, two- and
# three-bedroom listings, followed by a line chart comparing the three layouts.


def _rent_prices_for_layout(listings, room_tag):
    """Select the listings of one layout and extract their monthly rents.

    Args:
        listings: Iterable of listing dicts with '类型' and '月租金' entries.
        room_tag: Substring looked for in each listing's '类型', e.g. '1室'.

    Returns:
        A ``(matching_listings, prices)`` pair, where ``prices`` is a float
        NumPy array aligned with ``matching_listings``.
    """
    matching = [
        entry for entry in listings
        if entry.get('类型') and room_tag in entry['类型']
    ]
    # Only the text before the first '-' is parsed (presumably the lower bound
    # when the rent is given as a range — confirm against the crawled data).
    prices = np.array([float(entry['月租金'].split('-')[0]) for entry in matching])
    return matching, prices


def _print_rent_summary(prefix, prices):
    """Print and return the max, min, median and mean of ``prices``.

    Args:
        prefix: Text placed in front of each printed label, e.g. '天津市一居'.
        prices: NumPy array of monthly rents.

    Returns:
        A ``(highest, lowest, median, mean)`` tuple.

    Raises:
        ValueError: If ``prices`` is empty (raised by ``ndarray.max``).
    """
    highest = prices.max()
    lowest = prices.min()
    median = np.median(prices)
    mean = np.mean(prices)
    print(f"{prefix}最高租金:", highest)
    print(f"{prefix}最低租金:", lowest)
    print(f"{prefix}租金中位数:", median)
    print(f"{prefix}租金均价:", mean)
    return highest, lowest, median, mean


# One-bedroom listings.
tianjin_one_room_array, rent_prices = _rent_prices_for_layout(tianjin_array, '1室')
(
    tianjin_one_room_max_rent_price,
    tianjin_one_room_min_rent_price,
    tianjin_one_room_median_rent_price,
    tianjin_one_room_mean_rent_price,
) = _print_rent_summary("天津市一居", rent_prices)

# Two-bedroom listings.
tianjin_two_room_array, rent_prices = _rent_prices_for_layout(tianjin_array, '2室')
(
    tianjin_two_room_max_rent_price,
    tianjin_two_room_min_rent_price,
    tianjin_two_room_median_rent_price,
    tianjin_two_room_mean_rent_price,
) = _print_rent_summary("天津市二居", rent_prices)

# Three-bedroom listings.
tianjin_three_room_array, rent_prices = _rent_prices_for_layout(tianjin_array, '3室')
(
    tianjin_three_room_max_rent_price,
    tianjin_three_room_min_rent_price,
    tianjin_three_room_median_rent_price,
    tianjin_three_room_mean_rent_price,
) = _print_rent_summary("天津市三居", rent_prices)

# One record per layout so the statistics can be charted together.
data_dict = [
    {
        '户型': '一居',
        '最高价': tianjin_one_room_max_rent_price,
        '最低价': tianjin_one_room_min_rent_price,
        '中位数': tianjin_one_room_median_rent_price,
        '均价': tianjin_one_room_mean_rent_price,
    },
    {
        '户型': '二居',
        '最高价': tianjin_two_room_max_rent_price,
        '最低价': tianjin_two_room_min_rent_price,
        '中位数': tianjin_two_room_median_rent_price,
        '均价': tianjin_two_room_mean_rent_price,
    },
    {
        '户型': '三居',
        '最高价': tianjin_three_room_max_rent_price,
        '最低价': tianjin_three_room_min_rent_price,
        '中位数': tianjin_three_room_median_rent_price,
        '均价': tianjin_three_room_mean_rent_price,
    },
]

# One row per layout, one column per statistic.
df = pd.DataFrame(data_dict)

# Reshape to long form (layout, statistic name, price) for seaborn's lineplot.
df_melted = pd.melt(df, id_vars=['户型'], var_name='租金类型', value_name='价格')

# The chart plots the natural log of each price.
df_melted['对数价格'] = np.log(df_melted['价格'])

# One line per statistic across the three layouts.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(x="户型", y="对数价格", hue="租金类型", data=df_melted, marker='o')

# x position of each layout on the categorical axis.
# NOTE(review): assumes seaborn lays categories out in first-appearance order,
# the same assumption the previous `index % 3` arithmetic relied on.
layout_position = {layout: pos for pos, layout in enumerate(df['户型'])}

# Label every data point with its original (non-log) price. The raw price is
# used directly instead of exp(log(price)) to avoid a float round trip.
for _, row in df_melted.iterrows():
    # Median labels are placed below the marker, all others above it
    # (presumably to keep neighbouring labels from overlapping).
    label_offset = (0, -10) if row['租金类型'] == '中位数' else (0, 5)
    ax.annotate(
        f"{row['价格']:.0f}",
        (layout_position[row['户型']], row['对数价格']),
        textcoords="offset points",
        xytext=label_offset,
        ha='center',
    )

plt.title('不同户型的租金(对数)及原始价格对比折线图')
plt.ylabel('对数价格')
# Anchor the legend outside the axes, at the top-right corner.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

任务五

计算和分析每个城市不同板块的均价情况,并采用合适的图或表形式进行展示。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar chart of the mean monthly rent for each '二级地址' value in Beijing.

# Load the Beijing listings and coerce the monthly rent to numbers; values
# that cannot be parsed become NaN.
df = pd.DataFrame(beijing_array)
df = df.assign(月租金=pd.to_numeric(df['月租金'], errors='coerce'))

# Mean monthly rent per '二级地址', as a two-column frame.
avg_prices = df.groupby('二级地址', as_index=False)['月租金'].mean()

# Very wide canvas: one bar per '二级地址' value.
plt.figure(figsize=(100, 6))
bar_options = dict(x='二级地址', y='月租金', hue='二级地址', palette='viridis', legend=False)
sns.barplot(data=avg_prices, **bar_options)

# Tilt the x tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('均价(元)')
plt.xlabel('板块')
plt.title('不同板块的均价情况')
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar chart of the mean monthly rent for each '二级地址' value in Guangzhou.

# Load the Guangzhou listings and coerce the monthly rent to numbers; values
# that cannot be parsed become NaN.
df = pd.DataFrame(guangzhou_array)
df = df.assign(月租金=pd.to_numeric(df['月租金'], errors='coerce'))

# Mean monthly rent per '二级地址', as a two-column frame.
avg_prices = df.groupby('二级地址', as_index=False)['月租金'].mean()

# Very wide canvas: one bar per '二级地址' value.
plt.figure(figsize=(100, 6))
bar_options = dict(x='二级地址', y='月租金', hue='二级地址', palette='viridis', legend=False)
sns.barplot(data=avg_prices, **bar_options)

# Tilt the x tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('均价(元)')
plt.xlabel('板块')
plt.title('不同板块的均价情况')
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar chart of the mean monthly rent for each '二级地址' value in Shanghai.

# Load the Shanghai listings and coerce the monthly rent to numbers; values
# that cannot be parsed become NaN.
df = pd.DataFrame(shanghai_array)
df = df.assign(月租金=pd.to_numeric(df['月租金'], errors='coerce'))

# Mean monthly rent per '二级地址', as a two-column frame.
avg_prices = df.groupby('二级地址', as_index=False)['月租金'].mean()

# Very wide canvas: one bar per '二级地址' value.
plt.figure(figsize=(100, 6))
bar_options = dict(x='二级地址', y='月租金', hue='二级地址', palette='viridis', legend=False)
sns.barplot(data=avg_prices, **bar_options)

# Tilt the x tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('均价(元)')
plt.xlabel('板块')
plt.title('不同板块的均价情况')
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar chart of the mean monthly rent for each '二级地址' value in Tianjin.

# Load the Tianjin listings and coerce the monthly rent to numbers; values
# that cannot be parsed become NaN.
df = pd.DataFrame(tianjin_array)
df = df.assign(月租金=pd.to_numeric(df['月租金'], errors='coerce'))

# Mean monthly rent per '二级地址', as a two-column frame.
avg_prices = df.groupby('二级地址', as_index=False)['月租金'].mean()

# Very wide canvas: one bar per '二级地址' value.
plt.figure(figsize=(100, 6))
bar_options = dict(x='二级地址', y='月租金', hue='二级地址', palette='viridis', legend=False)
sns.barplot(data=avg_prices, **bar_options)

# Tilt the x tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('均价(元)')
plt.xlabel('板块')
plt.title('不同板块的均价情况')
plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Bar chart of the mean monthly rent for each '二级地址' value in Shenzhen.

# Load the Shenzhen listings and coerce the monthly rent to numbers; values
# that cannot be parsed become NaN.
df = pd.DataFrame(shenzhen_array)
df = df.assign(月租金=pd.to_numeric(df['月租金'], errors='coerce'))

# Mean monthly rent per '二级地址', as a two-column frame.
avg_prices = df.groupby('二级地址', as_index=False)['月租金'].mean()

# Very wide canvas: one bar per '二级地址' value.
plt.figure(figsize=(100, 6))
bar_options = dict(x='二级地址', y='月租金', hue='二级地址', palette='viridis', legend=False)
sns.barplot(data=avg_prices, **bar_options)

# Tilt the x tick labels so they stay readable.
plt.xticks(rotation=45, ha='right')
plt.ylabel('均价(元)')
plt.xlabel('板块')
plt.title('不同板块的均价情况')
plt.show()

任务六

比较各个城市不同朝向的单位面积租金分布情况,采用合适的图或表形式进行展示。哪个方向最高,哪个方向最低?各个城市是否一致?如果不一致,你认为原因是什么?

分析

由箱线图可知,北京东南方单位面积租金最高,北方最低;天津东北方单位面积租金最高,南方最低;上海东南方单位面积租金最高,北方最低;广州比较均匀;深圳西南方最高,东南方最低。

原因可能有以下几点:

  1. 地理位置: 不同城市的地理位置可能对朝向的单位面积租金产生影响。例如,面朝城市繁华区或拥有良好景观的方向可能更受欢迎,从而推高租金。

  2. 经济发展: 不同城市的经济状况和发展水平不同,可能导致不同朝向的单位面积租金存在差异。朝向经济繁荣区的方向可能租金较高。

  3. 交通便利性: 朝向交通便利的方向可能更受租户青睐,从而影响租金水平。

  4. 景观和环境: 朝向有良好景观或较好环境的方向可能租金较高,因为人们更倾向于选择居住在宜人的环境中。

  5. 区域规划: 城市规划和开发策略可能会影响不同方向的发展速度和吸引力,从而影响租金水平。

  6. 人口密度: 不同方向的人口密度和社会经济特征可能影响租金水平。高密度地区可能更容易找到租客,推高租金。

  7. 房屋品质和设施: 不同方向的房屋品质和设施可能存在差异,从而影响租金水平。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Box plot of rent per square metre by orientation, grouped by city.

# Build one DataFrame per city from the loaded listing records.
df_beijing = pd.DataFrame(beijing_array)
df_tianjin = pd.DataFrame(tianjin_array)
df_shanghai = pd.DataFrame(shanghai_array)
df_guangzhou = pd.DataFrame(guangzhou_array)
df_shenzhen = pd.DataFrame(shenzhen_array)

# Tag every row with its city so the frames can be told apart after merging.
df_beijing['城市'] = '北京'
df_tianjin['城市'] = '天津'
df_shanghai['城市'] = '上海'
df_guangzhou['城市'] = '广州'
df_shenzhen['城市'] = '深圳'

# Merge the five cities. ignore_index gives the combined frame unique row
# labels instead of five overlapping 0..n ranges.
df = pd.concat(
    [df_beijing, df_tianjin, df_shanghai, df_guangzhou, df_shenzhen],
    ignore_index=True,
)

# Coerce monthly rent to numbers; values that cannot be parsed become NaN.
df['月租金'] = pd.to_numeric(df['月租金'], errors='coerce')

# Rent per square metre. The chart title and y label describe unit-area rent,
# so the monthly rent is divided by the floor area before plotting.
# NOTE(review): assumes '面积' is text that starts with the area in square
# metres (e.g. "56.00㎡"); the first number found is used — confirm against
# the crawled data.
area = pd.to_numeric(
    df['面积'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
# Non-positive areas become NaN so they cannot produce infinite unit rents.
df['单位面积租金'] = df['月租金'] / area.where(area > 0)

# Orientation: last space-separated token of the listing name, up to the first
# '/'. The .str accessor yields NaN for missing names instead of raising, so
# the dropna below can actually remove them.
df['朝向'] = df['名字'].str.split(' ').str[-1].str.split('/').str[0]
# Strip everything up to and including the last '卧' in that token.
df['朝向'] = df['朝向'].replace(to_replace='.*卧', value='', regex=True)

# Discard rows whose orientation is missing or empty.
df = df.dropna(subset=['朝向'])
df = df[df['朝向'] != ""]

# One group of boxes per city, one box per orientation; outliers are hidden.
plt.figure(figsize=(20, 15))
sns.boxplot(x='城市', y='单位面积租金', hue='朝向', data=df, palette='viridis', showfliers=False)
plt.title('各城市不同朝向单位面积租金分布情况')
plt.xlabel('城市')
plt.ylabel('单位面积租金(元/㎡)')
plt.show()

任务七

查询各个城市的平均工资,分析并展示其和单位面积租金分布的关系。比较一下在哪个城市租房的负担最重?

分析

由图可知,北京的单位面积租金占月平均工资的比例最高,租房负担最重。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
'''
北京:2022年北京市全口径城镇单位就业人员月平均工资为10449元。
上海:2022年度上海市全口径城镇单位就业人员月平均工资为12183元。
广州:2022年广州市全口径城镇单位就业人员月平均工资为10449元。
深圳:2022年深圳市全口径城镇单位就业人员月平均工资为13730元。
天津:2022年天津市城镇非私营单位就业人员月平均工资为7919元。
'''

# Average monthly wage (yuan) per city, as quoted in the note above.
# NOTE(review): the Guangzhou figure is identical to Beijing's — verify it
# against the published statistic.
wage_beijing = 10449
wage_shanghai = 12183
wage_guangzhou = 10449
wage_shenzhen = 13730
wage_tianjin = 7919

# Share of the monthly wage taken by one square metre of rent, per city.
# NOTE(review): `shanghai_mean_rent_per_araa` is spelled differently from its
# siblings; kept as-is — confirm it matches the name used where it is defined.
beijing_ratio = beijing_mean_rent_per_area / wage_beijing
shanghai_ratio = shanghai_mean_rent_per_araa / wage_shanghai
guangzhou_ratio = guangzhou_mean_rent_per_area / wage_guangzhou
shenzhen_ratio = shenzhen_mean_rent_per_area / wage_shenzhen
tianjin_ratio = tianjin_mean_rent_per_area / wage_tianjin

# Tabulate the ratios, one row per city.
city_names = ['北京', '上海', '广州', '深圳', '天津']
city_ratios = [beijing_ratio, shanghai_ratio, guangzhou_ratio, shenzhen_ratio, tianjin_ratio]
df = pd.DataFrame(
    list(zip(city_names, city_ratios)),
    columns=['城市', '单位面积租金占工资比例'],
)

# Bar chart with the ratio printed above each bar.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=df, x='城市', y='单位面积租金占工资比例')
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.3f}',
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

plt.ylabel('单位面积租金占工资比例')
plt.xlabel('城市')
plt.title('各城市单位面积租金占工资比例')
plt.show()

任务八

与2022年的租房数据进行对比(只比较北上广深4个城市,原始数据会给出),总结你观察到的变化情况,并用图、表、文字等支撑你得到的结论。

分析

对比了各个市辖区在一年之内租金平均价格的变化。可以看出,北京和上海某些地段的价格变化十分明显。

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Compare Shanghai's mean monthly rent per area between the 2022 dataset and
# the 2023 crawl, and chart the difference.

# Load both years; the files are closed as soon as they have been parsed.
with open('ShanghaiHouseInfo.json', encoding='gbk') as file_2022:
    df_2022 = pd.DataFrame(json.load(file_2022))
with open('shanghai.json', encoding='utf-8') as file_2023:
    df_2023 = pd.DataFrame(json.load(file_2023))

# Coerce the 2023 monthly rent to numbers; unparseable values become NaN.
df_2023['月租金'] = pd.to_numeric(df_2023['月租金'], errors='coerce')

# Trim high outliers: keep only rows priced below mean + 2 * std.
sigma2022 = df_2022['total_price'].std()
mean2022 = df_2022['total_price'].mean()
df_2022 = df_2022[df_2022['total_price'] < mean2022 + 2 * sigma2022]

sigma2023 = df_2023['月租金'].std()
mean2023 = df_2023['月租金'].mean()
df_2023 = df_2023[df_2023['月租金'] < mean2023 + 2 * sigma2023]

# 2023 data: mean monthly rent grouped by '二级地址'.
average_prices_2023 = df_2023.groupby('二级地址')['月租金'].mean()

# 2022 data: mean price grouped by 'district'.
average_prices_2022 = df_2022.groupby('district')['total_price'].mean()

# The subtraction aligns on the group labels, so an area present in only one
# year yields NaN; those are dropped because there is nothing to compare.
# NOTE(review): assumes 2022 'district' labels use the same names as the 2023
# '二级地址' values — confirm.
rental_diff = (average_prices_2023 - average_prices_2022).dropna()

# Bar chart of the differences: green for an increase, red otherwise.
plt.figure(figsize=(100, 6))
sns.set(style="whitegrid", font='Microsoft YaHei')

bar_positions = range(len(rental_diff))
bar_width = 0.35

plt.bar(bar_positions, rental_diff, width=bar_width, color=['green' if diff > 0 else 'red' for diff in rental_diff])

plt.xlabel("地区")
plt.ylabel("月租金差异")
plt.title("2023年和2022年上海各区月租金差异比较")
plt.xticks(bar_positions, rental_diff.index, rotation='vertical')

plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Compare Beijing's mean monthly rent per area between the 2022 dataset and
# the 2023 crawl, and chart the difference.

# Load both years; the files are closed as soon as they have been parsed.
with open('BeijingHouseInfo.json', encoding='utf-8') as file_2022:
    df_2022 = pd.DataFrame(json.load(file_2022))
with open('beijing.json', encoding='utf-8') as file_2023:
    df_2023 = pd.DataFrame(json.load(file_2023))

# Coerce the 2023 monthly rent to numbers; unparseable values become NaN.
df_2023['月租金'] = pd.to_numeric(df_2023['月租金'], errors='coerce')

# Trim high outliers: keep only rows priced below mean + 2 * std.
sigma2022 = df_2022['total_price'].std()
mean2022 = df_2022['total_price'].mean()
df_2022 = df_2022[df_2022['total_price'] < mean2022 + 2 * sigma2022]

sigma2023 = df_2023['月租金'].std()
mean2023 = df_2023['月租金'].mean()
df_2023 = df_2023[df_2023['月租金'] < mean2023 + 2 * sigma2023]

# 2023 data: mean monthly rent grouped by '二级地址'.
average_prices_2023 = df_2023.groupby('二级地址')['月租金'].mean()

# 2022 data: mean price grouped by 'district'.
average_prices_2022 = df_2022.groupby('district')['total_price'].mean()

# The subtraction aligns on the group labels, so an area present in only one
# year yields NaN; those are dropped because there is nothing to compare.
# NOTE(review): assumes 2022 'district' labels use the same names as the 2023
# '二级地址' values — confirm.
rental_diff = (average_prices_2023 - average_prices_2022).dropna()

# Bar chart of the differences: green for an increase, red otherwise.
plt.figure(figsize=(100, 6))
sns.set(style="whitegrid", font='Microsoft YaHei')

bar_positions = range(len(rental_diff))
bar_width = 0.35

plt.bar(bar_positions, rental_diff, width=bar_width, color=['green' if diff > 0 else 'red' for diff in rental_diff])

plt.xlabel("地区")
plt.ylabel("月租金差异")
plt.title("2023年和2022年北京各区月租金差异比较")
plt.xticks(bar_positions, rental_diff.index, rotation='vertical')

plt.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# Compare Guangzhou's mean monthly rent per area between the 2022 dataset and
# the 2023 crawl, and chart the difference.

# Load both years; the files are closed as soon as they have been parsed.
with open('GuangzhouHouseInfo.json', encoding='utf-8') as file_2022:
    df_2022 = pd.DataFrame(json.load(file_2022))
with open('guangzhou.json', encoding='utf-8') as file_2023:
    df_2023 = pd.DataFrame(json.load(file_2023))

# Coerce the 2023 monthly rent to numbers; unparseable values become NaN.
df_2023['月租金'] = pd.to_numeric(df_2023['月租金'], errors='coerce')

# Trim high outliers: keep only rows priced below mean + 2 * std.
sigma2022 = df_2022['total_price'].std()
mean2022 = df_2022['total_price'].mean()
df_2022 = df_2022[df_2022['total_price'] < mean2022 + 2 * sigma2022]

sigma2023 = df_2023['月租金'].std()
mean2023 = df_2023['月租金'].mean()
df_2023 = df_2023[df_2023['月租金'] < mean2023 + 2 * sigma2023]

# 2023 data: mean monthly rent grouped by '二级地址'.
average_prices_2023 = df_2023.groupby('二级地址')['月租金'].mean()

# 2022 data: mean price grouped by 'district'.
average_prices_2022 = df_2022.groupby('district')['total_price'].mean()

# The subtraction aligns on the group labels, so an area present in only one
# year yields NaN; those are dropped because there is nothing to compare.
# NOTE(review): assumes 2022 'district' labels use the same names as the 2023
# '二级地址' values — confirm.
rental_diff = (average_prices_2023 - average_prices_2022).dropna()

# Bar chart of the differences: green for an increase, red otherwise.
plt.figure(figsize=(100, 6))
sns.set(style="whitegrid", font='Microsoft YaHei')

bar_positions = range(len(rental_diff))
bar_width = 0.35

plt.bar(bar_positions, rental_diff, width=bar_width, color=['green' if diff > 0 else 'red' for diff in rental_diff])

plt.xlabel("地区")
plt.ylabel("月租金差异")
plt.title("2023年和2022年广州各区月租金差异比较")
plt.xticks(bar_positions, rental_diff.index, rotation='vertical')

plt.show()