数据框的生成
using DataFrames
# Build the example data frame with keyword arguments: each keyword becomes
# a column name and its vector becomes the column contents. The `missing`
# entry in `age` gives that column the element type Union{Missing, Int64}.
da0 = DataFrame(;
    name = ["张三", "李四", "王五", "赵六"],
    age  = [33, 42, missing, 51],
    sex  = ["M", "F", "M", "M"],
)

# Keep a separate copy so that `da1` and `da0` are not the same object.
da1 = copy(da0)
|
String |
Int64? |
String |
1 |
张三 |
33 |
M |
2 |
李四 |
42 |
F |
3 |
王五 |
missing |
M |
4 |
赵六 |
51 |
M |
其中使用了`copy`生成`da0`的副本。数据框属于可变数据类型(mutable type),若直接将`da0`赋值给另一个变量`da1`,则这两个变量实际上指向同一个数据框,改变其中一个会同时改变另一个。
# Construct the data frame from "column name" => values pairs; the columns
# appear in the order in which the pairs are listed.
da0 = DataFrame(
    "name" => ["张三", "李四", "王五", "赵六"],
    "age"  => [33, 42, missing, 51],
    "sex"  => ["M", "F", "M", "M"],
)
|
String |
Int64? |
String |
1 |
张三 |
33 |
M |
2 |
李四 |
42 |
F |
3 |
王五 |
missing |
M |
4 |
赵六 |
51 |
M |
# Column data held in a dictionary keyed by column name.
di = Dict(
    "name" => ["张三", "李四", "王五", "赵六"],
    "age"  => [33, 42, missing, 51],
    "sex"  => ["M", "F", "M", "M"],
)

# Build the data frame from the dictionary. A Dict does not preserve
# insertion order; the displayed result lists the columns as age, name, sex.
da0 = DataFrame(di)
|
Int64? |
String |
String |
1 |
33 |
张三 |
M |
2 |
42 |
李四 |
F |
3 |
missing |
王五 |
M |
4 |
51 |
赵六 |
M |
names(da1) = ["name", "age", "sex"]
3-element Vector{String}:
"name"
"age"
"sex"
# Two-column summary of `da1`: every column name paired with the string form
# of that column's element type. The resulting columns are then renamed to
# "Variable" and "Type".
rename!(
    DataFrame(zip(names(da1), string.(eltype.(eachcol(da1))))),
    ["Variable", "Type"],
)
|
String |
String |
1 |
name |
String |
2 |
age |
Union{Missing, Int64} |
3 |
sex |
String |
访问单个元素
|
String |
Int64? |
String |
1 |
张三 |
33 |
M |
2 |
孙七 |
42 |
F |
3 |
王五 |
missing |
M |
4 |
赵六 |
51 |
M |
访问一列
4-element Vector{Union{Missing, Int64}}:
33
42
missing
51
在Julia中较为常用的方法是在行下标位置使用`!`(如`df[!, 列名]`),这种写法也可以用于对多个变量(列)进行操作。
4-element Vector{Union{Missing, Int64}}:
33
42
missing
51
冒号与感叹号之间仍然存在些许差别:`:`会生成该列的一个副本;而`!`不制作副本,直接返回原有的列,因此对用`!`取出的结果进行修改会同时改变原数据框。
CSV数据访问
using CSV
using Distributions
import XLSX
using DataFrames
using StatsBase
using StatsPlots
# Small demo table with an integer `year` column and a string `id` column.
name = DataFrame(
    year = [1, 2, 3],
    id = ["jian", "qi", "huang"],
)
|
Int64 |
String |
1 |
1 |
jian |
2 |
2 |
qi |
3 |
3 |
huang |
访问url上的数据
using Downloads
# Location of the processed Cleveland heart-disease file in the UCI repository.
urlf = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Download the file to a temporary local path, parse it with `header=0`
# (no line of the file is treated as a header row), then attach the 14
# column names. `rename!` returns the renamed data frame, which is bound
# to `dht`.
# NOTE(review): the raw file is believed to mark missing values with "?";
# if so, passing `missingstring="?"` to CSV.read may be wanted -- confirm
# against the data before changing.
dht = rename!(
    CSV.read(Downloads.download(urlf), DataFrame; header = 0),
    [
        "age", "sex", "cp", "trestbps", "chol",
        "fbs", "restecg", "thalach", "exang", "oldpeak",
        "slope", "ca", "thal", "num",
    ],
)
|
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
1 |
63.0 |
1.0 |
1.0 |
145.0 |
233.0 |
1.0 |
2.0 |
150.0 |
0.0 |
2 |
67.0 |
1.0 |
4.0 |
160.0 |
286.0 |
0.0 |
2.0 |
108.0 |
1.0 |
3 |
67.0 |
1.0 |
4.0 |
120.0 |
229.0 |
0.0 |
2.0 |
129.0 |
1.0 |
4 |
37.0 |
1.0 |
3.0 |
130.0 |
250.0 |
0.0 |
0.0 |
187.0 |
0.0 |
5 |
41.0 |
0.0 |
2.0 |
130.0 |
204.0 |
0.0 |
2.0 |
172.0 |
0.0 |
6 |
56.0 |
1.0 |
2.0 |
120.0 |
236.0 |
0.0 |
0.0 |
178.0 |
0.0 |
7 |
62.0 |
0.0 |
4.0 |
140.0 |
268.0 |
0.0 |
2.0 |
160.0 |
0.0 |
8 |
57.0 |
0.0 |
4.0 |
120.0 |
354.0 |
0.0 |
0.0 |
163.0 |
1.0 |
9 |
63.0 |
1.0 |
4.0 |
130.0 |
254.0 |
0.0 |
2.0 |
147.0 |
0.0 |
10 |
53.0 |
1.0 |
4.0 |
140.0 |
203.0 |
1.0 |
2.0 |
155.0 |
1.0 |
11 |
57.0 |
1.0 |
4.0 |
140.0 |
192.0 |
0.0 |
0.0 |
148.0 |
0.0 |
12 |
56.0 |
0.0 |
2.0 |
140.0 |
294.0 |
0.0 |
2.0 |
153.0 |
0.0 |
13 |
56.0 |
1.0 |
3.0 |
130.0 |
256.0 |
1.0 |
2.0 |
142.0 |
1.0 |
14 |
44.0 |
1.0 |
2.0 |
120.0 |
263.0 |
0.0 |
0.0 |
173.0 |
0.0 |
15 |
52.0 |
1.0 |
3.0 |
172.0 |
199.0 |
1.0 |
0.0 |
162.0 |
0.0 |
16 |
57.0 |
1.0 |
3.0 |
150.0 |
168.0 |
0.0 |
0.0 |
174.0 |
0.0 |
17 |
48.0 |
1.0 |
2.0 |
110.0 |
229.0 |
0.0 |
0.0 |
168.0 |
0.0 |
18 |
54.0 |
1.0 |
4.0 |
140.0 |
239.0 |
0.0 |
0.0 |
160.0 |
0.0 |
19 |
48.0 |
0.0 |
3.0 |
130.0 |
275.0 |
0.0 |
0.0 |
139.0 |
0.0 |
20 |
49.0 |
1.0 |
2.0 |
130.0 |
266.0 |
0.0 |
0.0 |
171.0 |
0.0 |
21 |
64.0 |
1.0 |
1.0 |
110.0 |
211.0 |
0.0 |
2.0 |
144.0 |
1.0 |
22 |
58.0 |
0.0 |
1.0 |
150.0 |
283.0 |
1.0 |
2.0 |
162.0 |
0.0 |
23 |
58.0 |
1.0 |
2.0 |
120.0 |
284.0 |
0.0 |
2.0 |
160.0 |
0.0 |
24 |
58.0 |
1.0 |
3.0 |
132.0 |
224.0 |
0.0 |
2.0 |
173.0 |
0.0 |
25 |
60.0 |
1.0 |
4.0 |
130.0 |
206.0 |
0.0 |
2.0 |
132.0 |
1.0 |
26 |
50.0 |
0.0 |
3.0 |
120.0 |
219.0 |
0.0 |
0.0 |
158.0 |
0.0 |
27 |
58.0 |
0.0 |
3.0 |
120.0 |
340.0 |
0.0 |
0.0 |
172.0 |
0.0 |
28 |
66.0 |
0.0 |
1.0 |
150.0 |
226.0 |
0.0 |
0.0 |
114.0 |
0.0 |
29 |
43.0 |
1.0 |
4.0 |
150.0 |
247.0 |
0.0 |
0.0 |
171.0 |
0.0 |
30 |
40.0 |
1.0 |
4.0 |
110.0 |
167.0 |
0.0 |
2.0 |
114.0 |
1.0 |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
在行下标位置写单独的叹号表示所有行,在列下标指定一列后可以访问数据框的这一列,不制作副本。比如,`df[!, 2]`、`df[!, "age"]`或`df[!, :age]`都可以取出`df`的第二列作为一个一维数组:
29-element Vector{Float64}:
36102.6
35445.1
33106.0
29883.0
27041.2
24779.1
22926.0
21134.6
19024.7
17188.8
14964.0
12900.9
11813.1
⋮
5267.2
4525.7
3861.5
3277.8
2759.8
2439.1
2118.1
1819.4
1516.2
1149.8
888.9
710.2
|
Symbol |
Float64 |
Real |
Float64 |
Real |
Int64 |
DataType |
1 |
Column1 |
2006.0 |
1992 |
2006.0 |
2020 |
0 |
Int64 |
2 |
Beijing |
12719.2 |
710.2 |
8387.0 |
36102.6 |
0 |
Float64 |
3 |
Tianjin |
5536.72 |
411.0 |
3538.2 |
14083.7 |
0 |
Float64 |
4 |
Hebei |
14254.6 |
1278.5 |
10043.0 |
36206.9 |
0 |
Float64 |
5 |
Shanxi |
6732.14 |
551.1 |
4713.6 |
17651.9 |
0 |
Float64 |
6 |
Inner Mongolia |
6485.58 |
421.7 |
4161.8 |
17359.8 |
0 |
Float64 |
7 |
Liaoning |
11209.5 |
1473.0 |
8390.3 |
25115.0 |
0 |
Float64 |
8 |
Jilin |
5135.64 |
558.1 |
3226.5 |
12311.3 |
0 |
Float64 |
9 |
Heilongjiang |
6601.08 |
857.4 |
5329.8 |
13698.5 |
0 |
Float64 |
10 |
Shanghai |
14671.4 |
1114.3 |
10598.9 |
38700.6 |
0 |
Float64 |
11 |
Jiangsu |
35370.5 |
2136.0 |
21240.8 |
102719.0 |
0 |
Float64 |
12 |
Zhejiang |
22816.2 |
1375.7 |
15302.7 |
64613.3 |
0 |
Float64 |
13 |
Anhui |
12282.8 |
827.0 |
6500.3 |
38680.6 |
0 |
Float64 |
14 |
Fujian |
13892.8 |
784.7 |
7468.6 |
43903.9 |
0 |
Float64 |
15 |
Jiangxi |
8414.53 |
572.6 |
4696.8 |
25691.5 |
0 |
Float64 |
16 |
Shandong |
27864.4 |
2196.5 |
18967.8 |
73129.0 |
0 |
Float64 |
17 |
Henan |
19156.4 |
1279.8 |
11977.9 |
54997.1 |
0 |
Float64 |
18 |
Hubei |
14904.0 |
1088.4 |
7531.8 |
45429.0 |
0 |
Float64 |
19 |
Hunan |
13893.0 |
987.0 |
7431.6 |
41781.5 |
0 |
Float64 |
20 |
Guangdong |
38962.5 |
2447.5 |
25961.2 |
1.10761e5 |
0 |
Float64 |
21 |
Guangxi, |
7576.56 |
646.6 |
4417.8 |
22156.7 |
0 |
Float64 |
22 |
Hainan |
1872.47 |
184.9 |
1027.5 |
5532.4 |
0 |
Float64 |
23 |
Chongqing |
7843.67 |
462.5 |
3900.3 |
25002.8 |
0 |
Float64 |
24 |
Sichuan |
15689.6 |
1177.3 |
8494.7 |
48598.8 |
0 |
Float64 |
25 |
Guizhou |
5036.27 |
339.9 |
2264.1 |
17826.6 |
0 |
Float64 |
26 |
Yunnan |
7647.3 |
618.7 |
4090.7 |
24521.9 |
0 |
Float64 |
27 |
Tibet |
529.959 |
33.3 |
285.9 |
1902.7 |
0 |
Float64 |
28 |
Shaanxi |
8744.46 |
531.6 |
4595.6 |
26181.9 |
0 |
Float64 |
29 |
Gansu |
3377.07 |
317.8 |
2203.0 |
9016.7 |
0 |
Float64 |
30 |
Qinghai |
1026.16 |
87.5 |
585.2 |
3005.9 |
0 |
Float64 |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
1 |
北京 |
39138 |
58042 |
53062 |
49455 |
43187 |
143717 |
94956 |
65646 |
64250 |
1 |
北京 |
39138 |
58042 |
53062 |
49455 |
43187 |
143717 |
94956 |
65646 |
64250 |
2 |
天津 |
36007 |
61667 |
47103 |
50372 |
43400 |
68436 |
58365 |
44999 |
51602 |
using Downloads
# Location of the processed Cleveland heart-disease file in the UCI repository.
urlf = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# Fetch the file to a temporary local path and read it with `header=0`, so
# no line of the file is used as a header row; the 14 column names are then
# supplied explicitly. `rename!` returns the data frame it renamed, which is
# what ends up bound to `dht`.
# NOTE(review): missing values in the raw file are believed to be written
# as "?"; confirm, and consider `missingstring="?"` if that is the case.
dht = rename!(
    CSV.read(Downloads.download(urlf), DataFrame; header = 0),
    [
        "age", "sex", "cp", "trestbps", "chol",
        "fbs", "restecg", "thalach", "exang", "oldpeak",
        "slope", "ca", "thal", "num",
    ],
)
|
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
Float64 |
1 |
63.0 |
1.0 |
1.0 |
145.0 |
233.0 |
1.0 |
2.0 |
150.0 |
0.0 |
2 |
67.0 |
1.0 |
4.0 |
160.0 |
286.0 |
0.0 |
2.0 |
108.0 |
1.0 |
3 |
67.0 |
1.0 |
4.0 |
120.0 |
229.0 |
0.0 |
2.0 |
129.0 |
1.0 |
4 |
37.0 |
1.0 |
3.0 |
130.0 |
250.0 |
0.0 |
0.0 |
187.0 |
0.0 |
5 |
41.0 |
0.0 |
2.0 |
130.0 |
204.0 |
0.0 |
2.0 |
172.0 |
0.0 |
6 |
56.0 |
1.0 |
2.0 |
120.0 |
236.0 |
0.0 |
0.0 |
178.0 |
0.0 |
7 |
62.0 |
0.0 |
4.0 |
140.0 |
268.0 |
0.0 |
2.0 |
160.0 |
0.0 |
8 |
57.0 |
0.0 |
4.0 |
120.0 |
354.0 |
0.0 |
0.0 |
163.0 |
1.0 |
9 |
63.0 |
1.0 |
4.0 |
130.0 |
254.0 |
0.0 |
2.0 |
147.0 |
0.0 |
10 |
53.0 |
1.0 |
4.0 |
140.0 |
203.0 |
1.0 |
2.0 |
155.0 |
1.0 |
11 |
57.0 |
1.0 |
4.0 |
140.0 |
192.0 |
0.0 |
0.0 |
148.0 |
0.0 |
12 |
56.0 |
0.0 |
2.0 |
140.0 |
294.0 |
0.0 |
2.0 |
153.0 |
0.0 |
13 |
56.0 |
1.0 |
3.0 |
130.0 |
256.0 |
1.0 |
2.0 |
142.0 |
1.0 |
14 |
44.0 |
1.0 |
2.0 |
120.0 |
263.0 |
0.0 |
0.0 |
173.0 |
0.0 |
15 |
52.0 |
1.0 |
3.0 |
172.0 |
199.0 |
1.0 |
0.0 |
162.0 |
0.0 |
16 |
57.0 |
1.0 |
3.0 |
150.0 |
168.0 |
0.0 |
0.0 |
174.0 |
0.0 |
17 |
48.0 |
1.0 |
2.0 |
110.0 |
229.0 |
0.0 |
0.0 |
168.0 |
0.0 |
18 |
54.0 |
1.0 |
4.0 |
140.0 |
239.0 |
0.0 |
0.0 |
160.0 |
0.0 |
19 |
48.0 |
0.0 |
3.0 |
130.0 |
275.0 |
0.0 |
0.0 |
139.0 |
0.0 |
20 |
49.0 |
1.0 |
2.0 |
130.0 |
266.0 |
0.0 |
0.0 |
171.0 |
0.0 |
21 |
64.0 |
1.0 |
1.0 |
110.0 |
211.0 |
0.0 |
2.0 |
144.0 |
1.0 |
22 |
58.0 |
0.0 |
1.0 |
150.0 |
283.0 |
1.0 |
2.0 |
162.0 |
0.0 |
23 |
58.0 |
1.0 |
2.0 |
120.0 |
284.0 |
0.0 |
2.0 |
160.0 |
0.0 |
24 |
58.0 |
1.0 |
3.0 |
132.0 |
224.0 |
0.0 |
2.0 |
173.0 |
0.0 |
25 |
60.0 |
1.0 |
4.0 |
130.0 |
206.0 |
0.0 |
2.0 |
132.0 |
1.0 |
26 |
50.0 |
0.0 |
3.0 |
120.0 |
219.0 |
0.0 |
0.0 |
158.0 |
0.0 |
27 |
58.0 |
0.0 |
3.0 |
120.0 |
340.0 |
0.0 |
0.0 |
172.0 |
0.0 |
28 |
66.0 |
0.0 |
1.0 |
150.0 |
226.0 |
0.0 |
0.0 |
114.0 |
0.0 |
29 |
43.0 |
1.0 |
4.0 |
150.0 |
247.0 |
0.0 |
0.0 |
171.0 |
0.0 |
30 |
40.0 |
1.0 |
4.0 |
110.0 |
167.0 |
0.0 |
2.0 |
114.0 |
1.0 |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
10-element Vector{Float64}:
63.0
67.0
67.0
37.0
41.0
56.0
62.0
57.0
63.0
53.0
# Five (x, y) pairs: `x` is the range 1:5 and `y` holds the matching values.
dline01 = DataFrame(
    x = 1:5,
    y = [11, 13, 18, 15, 14],
)
1 |
1 |
11 |
2 |
2 |
13 |
3 |
3 |
18 |
4 |
4 |
15 |
5 |
5 |
14 |
# Add up 0, 0.1, 0.2, ..., 6.0 with an explicit loop.
#
# The accumulator is named `total` rather than `sum`: `sum` already refers to
# the function Base.sum, and assigning to it at top level fails with
#   "cannot assign a value to variable Base.sum from module Main".
# The loop runs inside a `let` block so the accumulator is an ordinary local
# variable; this avoids the global-scope rules that apply to a bare top-level
# `for` loop updating a global.
total = let acc = 0.0
    for step in 0:0.1:6
        acc += step
    end
    acc
end
@show total
using DataFrames, DataFramesMeta
using CategoricalArrays
using Makie
using LinearAlgebra
读取数据库数据
# Load the "iris" table from the "datasets" collection.
# NOTE(review): `dataset` is presumably RDatasets.dataset, but no
# `using RDatasets` is visible in this part of the file -- confirm the
# package is loaded before this line runs.
iris = dataset("datasets", "iris")
|
Float64 |
Float64 |
Float64 |
Float64 |
Cat… |
1 |
5.1 |
3.5 |
1.4 |
0.2 |
setosa |
2 |
4.9 |
3.0 |
1.4 |
0.2 |
setosa |
3 |
4.7 |
3.2 |
1.3 |
0.2 |
setosa |
4 |
4.6 |
3.1 |
1.5 |
0.2 |
setosa |
5 |
5.0 |
3.6 |
1.4 |
0.2 |
setosa |
6 |
5.4 |
3.9 |
1.7 |
0.4 |
setosa |
7 |
4.6 |
3.4 |
1.4 |
0.3 |
setosa |
8 |
5.0 |
3.4 |
1.5 |
0.2 |
setosa |
9 |
4.4 |
2.9 |
1.4 |
0.2 |
setosa |
10 |
4.9 |
3.1 |
1.5 |
0.1 |
setosa |
11 |
5.4 |
3.7 |
1.5 |
0.2 |
setosa |
12 |
4.8 |
3.4 |
1.6 |
0.2 |
setosa |
13 |
4.8 |
3.0 |
1.4 |
0.1 |
setosa |
14 |
4.3 |
3.0 |
1.1 |
0.1 |
setosa |
15 |
5.8 |
4.0 |
1.2 |
0.2 |
setosa |
16 |
5.7 |
4.4 |
1.5 |
0.4 |
setosa |
17 |
5.4 |
3.9 |
1.3 |
0.4 |
setosa |
18 |
5.1 |
3.5 |
1.4 |
0.3 |
setosa |
19 |
5.7 |
3.8 |
1.7 |
0.3 |
setosa |
20 |
5.1 |
3.8 |
1.5 |
0.3 |
setosa |
21 |
5.4 |
3.4 |
1.7 |
0.2 |
setosa |
22 |
5.1 |
3.7 |
1.5 |
0.4 |
setosa |
23 |
4.6 |
3.6 |
1.0 |
0.2 |
setosa |
24 |
5.1 |
3.3 |
1.7 |
0.5 |
setosa |
25 |
4.8 |
3.4 |
1.9 |
0.2 |
setosa |
26 |
5.0 |
3.0 |
1.6 |
0.2 |
setosa |
27 |
5.0 |
3.4 |
1.6 |
0.4 |
setosa |
28 |
5.2 |
3.5 |
1.5 |
0.2 |
setosa |
29 |
5.2 |
3.4 |
1.4 |
0.2 |
setosa |
30 |
4.7 |
3.2 |
1.6 |
0.2 |
setosa |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
⋮ |
# Split the iris table into measurements and labels. Columns 1:4 are the
# four Float64 measurement columns; the last column is the categorical label
# (values such as "setosa" and "virginica").
# NOTE(review): `sample` is also the name of a function exported by
# StatsBase, which this file loads; binding it as a variable may conflict
# if that function has already been used -- consider renaming.
sample = iris[:, 1:4]
label = iris[:, end]

# Training data: the odd-numbered rows (1, 3, 5, ...).
train = sample[1:2:end, :]
train_label = label[1:2:end]
# The original code spelled this `train_albel`; keep that name as an alias
# so any existing reference to it still works.
train_albel = train_label

# Test data: the even-numbered rows (2, 4, 6, ...).
test = sample[2:2:end, :]
test_label = label[2:2:end]
# The iris data still has to be converted from DataFrame to Array.
75-element CategoricalArray{String,1,UInt8}:
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
"setosa"
⋮
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
"virginica"
LoadError: UndefVarError: train not defined