1(* Title: HOL/Probability/Information.thy 2 Author: Johannes H��lzl, TU M��nchen 3 Author: Armin Heller, TU M��nchen 4*) 5 6section \<open>Information theory\<close> 7 8theory Information 9imports 10 Independent_Family 11begin 12 13lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y" 14 by (subst log_le_cancel_iff) auto 15 16lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y" 17 by (subst log_less_cancel_iff) auto 18 19lemma sum_cartesian_product': 20 "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. sum (\<lambda>y. f (x, y)) B)" 21 unfolding sum.cartesian_product by simp 22 23lemma split_pairs: 24 "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and 25 "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto 26 27subsection "Information theory" 28 29locale information_space = prob_space + 30 fixes b :: real assumes b_gt_1: "1 < b" 31 32context information_space 33begin 34 35text \<open>Introduce some simplification rules for logarithm of base \<^term>\<open>b\<close>.\<close> 36 37lemma log_neg_const: 38 assumes "x \<le> 0" 39 shows "log b x = log b 0" 40proof - 41 { fix u :: real 42 have "x \<le> 0" by fact 43 also have "0 < exp u" 44 using exp_gt_zero . 45 finally have "exp u \<noteq> x" 46 by auto } 47 then show "log b x = log b 0" 48 by (simp add: log_def ln_real_def) 49qed 50 51lemma log_mult_eq: 52 "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)" 53 using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"] 54 by (auto simp: zero_less_mult_iff mult_le_0_iff) 55 56lemma log_inverse_eq: 57 "log b (inverse B) = (if 0 < B then - log b B else log b 0)" 58 using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp 59 60lemma log_divide_eq: 61 "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)" 62 unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse 63 by (auto simp: zero_less_mult_iff mult_le_0_iff) 64 65lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq 66 67end 68 69subsection "Kullback$-$Leibler divergence" 70 71text \<open>The Kullback$-$Leibler divergence is also known as relative entropy or 72Kullback$-$Leibler distance.\<close> 73 74definition 75 "entropy_density b M N = log b \<circ> enn2real \<circ> RN_deriv M N" 76 77definition 78 "KL_divergence b M N = integral\<^sup>L N (entropy_density b M N)" 79 80lemma measurable_entropy_density[measurable]: "entropy_density b M N \<in> borel_measurable M" 81 unfolding entropy_density_def by auto 82 83lemma (in sigma_finite_measure) KL_density: 84 fixes f :: "'a \<Rightarrow> real" 85 assumes "1 < b" 86 assumes f[measurable]: "f \<in> borel_measurable M" and nn: "AE x in M. 0 \<le> f x" 87 shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)" 88 unfolding KL_divergence_def 89proof (subst integral_real_density) 90 show [measurable]: "entropy_density b M (density M (\<lambda>x. ennreal (f x))) \<in> borel_measurable M" 91 using f 92 by (auto simp: comp_def entropy_density_def) 93 have "density M (RN_deriv M (density M f)) = density M f" 94 using f nn by (intro density_RN_deriv_density) auto 95 then have eq: "AE x in M. RN_deriv M (density M f) x = f x" 96 using f nn by (intro density_unique) auto 97 show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ennreal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)" 98 apply (intro integral_cong_AE) 99 apply measurable 100 using eq nn 101 apply eventually_elim 102 apply (auto simp: entropy_density_def) 103 done 104qed fact+ 105 106lemma (in sigma_finite_measure) KL_density_density: 107 fixes f g :: "'a \<Rightarrow> real" 108 assumes "1 < b" 109 assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" 110 assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" 111 assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0" 112 shows "KL_divergence b (density M f) (density M g) = (\<integral>x. g x * log b (g x / f x) \<partial>M)" 113proof - 114 interpret Mf: sigma_finite_measure "density M f" 115 using f by (subst sigma_finite_iff_density_finite) auto 116 have "KL_divergence b (density M f) (density M g) = 117 KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))" 118 using f g ac by (subst density_density_divide) simp_all 119 also have "\<dots> = (\<integral>x. (g x / f x) * log b (g x / f x) \<partial>density M f)" 120 using f g \<open>1 < b\<close> by (intro Mf.KL_density) (auto simp: AE_density) 121 also have "\<dots> = (\<integral>x. g x * log b (g x / f x) \<partial>M)" 122 using ac f g \<open>1 < b\<close> by (subst integral_density) (auto intro!: integral_cong_AE) 123 finally show ?thesis . 124qed 125 126lemma (in information_space) KL_gt_0: 127 fixes D :: "'a \<Rightarrow> real" 128 assumes "prob_space (density M D)" 129 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" 130 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" 131 assumes A: "density M D \<noteq> M" 132 shows "0 < KL_divergence b M (density M D)" 133proof - 134 interpret N: prob_space "density M D" by fact 135 136 obtain A where "A \<in> sets M" "emeasure (density M D) A \<noteq> emeasure M A" 137 using measure_eqI[of "density M D" M] \<open>density M D \<noteq> M\<close> by auto 138 139 let ?D_set = "{x\<in>space M. D x \<noteq> 0}" 140 have [simp, intro]: "?D_set \<in> sets M" 141 using D by auto 142 143 have D_neg: "(\<integral>\<^sup>+ x. ennreal (- D x) \<partial>M) = 0" 144 using D by (subst nn_integral_0_iff_AE) (auto simp: ennreal_neg) 145 146 have "(\<integral>\<^sup>+ x. ennreal (D x) \<partial>M) = emeasure (density M D) (space M)" 147 using D by (simp add: emeasure_density cong: nn_integral_cong) 148 then have D_pos: "(\<integral>\<^sup>+ x. ennreal (D x) \<partial>M) = 1" 149 using N.emeasure_space_1 by simp 150 151 have "integrable M D" 152 using D D_pos D_neg unfolding real_integrable_def real_lebesgue_integral_def by simp_all 153 then have "integral\<^sup>L M D = 1" 154 using D D_pos D_neg by (simp add: real_lebesgue_integral_def) 155 156 have "0 \<le> 1 - measure M ?D_set" 157 using prob_le_1 by (auto simp: field_simps) 158 also have "\<dots> = (\<integral> x. D x - indicator ?D_set x \<partial>M)" 159 using \<open>integrable M D\<close> \<open>integral\<^sup>L M D = 1\<close> 160 by (simp add: emeasure_eq_measure) 161 also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)" 162 proof (rule integral_less_AE) 163 show "integrable M (\<lambda>x. D x - indicator ?D_set x)" 164 using \<open>integrable M D\<close> by (auto simp: less_top[symmetric]) 165 next 166 from integrable_mult_left(1)[OF int, of "ln b"] 167 show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))" 168 by (simp add: ac_simps) 169 next 170 show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0" 171 proof 172 assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0" 173 then have disj: "AE x in M. D x = 1 \<or> D x = 0" 174 using D(1) by (auto intro!: AE_I[OF subset_refl] sets.sets_Collect) 175 176 have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^sup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)" 177 using D(1) by auto 178 also have "\<dots> = (\<integral>\<^sup>+ x. ennreal (D x) \<partial>M)" 179 using disj by (auto intro!: nn_integral_cong_AE simp: indicator_def one_ennreal_def) 180 finally have "AE x in M. D x = 1" 181 using D D_pos by (intro AE_I_eq_1) auto 182 then have "(\<integral>\<^sup>+x. indicator A x\<partial>M) = (\<integral>\<^sup>+x. ennreal (D x) * indicator A x\<partial>M)" 183 by (intro nn_integral_cong_AE) (auto simp: one_ennreal_def[symmetric]) 184 also have "\<dots> = density M D A" 185 using \<open>A \<in> sets M\<close> D by (simp add: emeasure_density) 186 finally show False using \<open>A \<in> sets M\<close> \<open>emeasure (density M D) A \<noteq> emeasure M A\<close> by simp 187 qed 188 show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M" 189 using D(1) by (auto intro: sets.sets_Collect_conj) 190 191 show "AE t in M. t \<in> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<longrightarrow> 192 D t - indicator ?D_set t \<noteq> D t * (ln b * log b (D t))" 193 using D(2) 194 proof (eventually_elim, safe) 195 fix t assume Dt: "t \<in> space M" "D t \<noteq> 1" "D t \<noteq> 0" "0 \<le> D t" 196 and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))" 197 198 have "D t - 1 = D t - indicator ?D_set t" 199 using Dt by simp 200 also note eq 201 also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)" 202 using b_gt_1 \<open>D t \<noteq> 0\<close> \<open>0 \<le> D t\<close> 203 by (simp add: log_def ln_div less_le) 204 finally have "ln (1 / D t) = 1 / D t - 1" 205 using \<open>D t \<noteq> 0\<close> by (auto simp: field_simps) 206 from ln_eq_minus_one[OF _ this] \<open>D t \<noteq> 0\<close> \<open>0 \<le> D t\<close> \<open>D t \<noteq> 1\<close> 207 show False by auto 208 qed 209 210 show "AE t in M. D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))" 211 using D(2) AE_space 212 proof eventually_elim 213 fix t assume "t \<in> space M" "0 \<le> D t" 214 show "D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))" 215 proof cases 216 assume asm: "D t \<noteq> 0" 217 then have "0 < D t" using \<open>0 \<le> D t\<close> by auto 218 then have "0 < 1 / D t" by auto 219 have "D t - indicator ?D_set t \<le> - D t * (1 / D t - 1)" 220 using asm \<open>t \<in> space M\<close> by (simp add: field_simps) 221 also have "- D t * (1 / D t - 1) \<le> - D t * ln (1 / D t)" 222 using ln_le_minus_one \<open>0 < 1 / D t\<close> by (intro mult_left_mono_neg) auto 223 also have "\<dots> = D t * (ln b * log b (D t))" 224 using \<open>0 < D t\<close> b_gt_1 225 by (simp_all add: log_def ln_div) 226 finally show ?thesis by simp 227 qed simp 228 qed 229 qed 230 also have "\<dots> = (\<integral> x. ln b * (D x * log b (D x)) \<partial>M)" 231 by (simp add: ac_simps) 232 also have "\<dots> = ln b * (\<integral> x. D x * log b (D x) \<partial>M)" 233 using int by simp 234 finally show ?thesis 235 using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff) 236qed 237 238lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0" 239proof - 240 have "AE x in M. 1 = RN_deriv M M x" 241 proof (rule RN_deriv_unique) 242 show "density M (\<lambda>x. 1) = M" 243 apply (auto intro!: measure_eqI emeasure_density) 244 apply (subst emeasure_density) 245 apply auto 246 done 247 qed auto 248 then have "AE x in M. log b (enn2real (RN_deriv M M x)) = 0" 249 by (elim AE_mp) simp 250 from integral_cong_AE[OF _ _ this] 251 have "integral\<^sup>L M (entropy_density b M M) = 0" 252 by (simp add: entropy_density_def comp_def) 253 then show "KL_divergence b M M = 0" 254 unfolding KL_divergence_def 255 by auto 256qed 257 258lemma (in information_space) KL_eq_0_iff_eq: 259 fixes D :: "'a \<Rightarrow> real" 260 assumes "prob_space (density M D)" 261 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" 262 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" 263 shows "KL_divergence b M (density M D) = 0 \<longleftrightarrow> density M D = M" 264 using KL_same_eq_0[of b] KL_gt_0[OF assms] 265 by (auto simp: less_le) 266 267lemma (in information_space) KL_eq_0_iff_eq_ac: 268 fixes D :: "'a \<Rightarrow> real" 269 assumes "prob_space N" 270 assumes ac: "absolutely_continuous M N" "sets N = sets M" 271 assumes int: "integrable N (entropy_density b M N)" 272 shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M" 273proof - 274 interpret N: prob_space N by fact 275 have "finite_measure N" by unfold_locales 276 from real_RN_deriv[OF this ac] guess D . note D = this 277 278 have "N = density M (RN_deriv M N)" 279 using ac by (rule density_RN_deriv[symmetric]) 280 also have "\<dots> = density M D" 281 using D by (auto intro!: density_cong) 282 finally have N: "N = density M D" . 283 284 from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density 285 have "integrable N (\<lambda>x. log b (D x))" 286 by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int]) 287 (auto simp: N entropy_density_def) 288 with D b_gt_1 have "integrable M (\<lambda>x. D x * log b (D x))" 289 by (subst integrable_real_density[symmetric]) (auto simp: N[symmetric] comp_def) 290 with \<open>prob_space N\<close> D show ?thesis 291 unfolding N 292 by (intro KL_eq_0_iff_eq) auto 293qed 294 295lemma (in information_space) KL_nonneg: 296 assumes "prob_space (density M D)" 297 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" 298 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" 299 shows "0 \<le> KL_divergence b M (density M D)" 300 using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0) 301 302lemma (in sigma_finite_measure) KL_density_density_nonneg: 303 fixes f g :: "'a \<Rightarrow> real" 304 assumes "1 < b" 305 assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" "prob_space (density M f)" 306 assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" "prob_space (density M g)" 307 assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0" 308 assumes int: "integrable M (\<lambda>x. g x * log b (g x / f x))" 309 shows "0 \<le> KL_divergence b (density M f) (density M g)" 310proof - 311 interpret Mf: prob_space "density M f" by fact 312 interpret Mf: information_space "density M f" b by standard fact 313 have eq: "density (density M f) (\<lambda>x. g x / f x) = density M g" (is "?DD = _") 314 using f g ac by (subst density_density_divide) simp_all 315 316 have "0 \<le> KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))" 317 proof (rule Mf.KL_nonneg) 318 show "prob_space ?DD" unfolding eq by fact 319 from f g show "(\<lambda>x. g x / f x) \<in> borel_measurable (density M f)" 320 by auto 321 show "AE x in density M f. 0 \<le> g x / f x" 322 using f g by (auto simp: AE_density) 323 show "integrable (density M f) (\<lambda>x. g x / f x * log b (g x / f x))" 324 using \<open>1 < b\<close> f g ac 325 by (subst integrable_density) 326 (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If) 327 qed 328 also have "\<dots> = KL_divergence b (density M f) (density M g)" 329 using f g ac by (subst density_density_divide) simp_all 330 finally show ?thesis . 331qed 332 333subsection \<open>Finite Entropy\<close> 334 335definition (in information_space) finite_entropy :: "'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> ('b \<Rightarrow> real) \<Rightarrow> bool" 336where 337 "finite_entropy S X f \<longleftrightarrow> 338 distributed M S X f \<and> 339 integrable S (\<lambda>x. f x * log b (f x)) \<and> 340 (\<forall>x\<in>space S. 0 \<le> f x)" 341 342lemma (in information_space) finite_entropy_simple_function: 343 assumes X: "simple_function M X" 344 shows "finite_entropy (count_space (X`space M)) X (\<lambda>a. measure M {x \<in> space M. X x = a})" 345 unfolding finite_entropy_def 346proof safe 347 have [simp]: "finite (X ` space M)" 348 using X by (auto simp: simple_function_def) 349 then show "integrable (count_space (X ` space M)) 350 (\<lambda>x. prob {xa \<in> space M. X xa = x} * log b (prob {xa \<in> space M. X xa = x}))" 351 by (rule integrable_count_space) 352 have d: "distributed M (count_space (X ` space M)) X (\<lambda>x. ennreal (if x \<in> X`space M then prob {xa \<in> space M. X xa = x} else 0))" 353 by (rule distributed_simple_function_superset[OF X]) (auto intro!: arg_cong[where f=prob]) 354 show "distributed M (count_space (X ` space M)) X (\<lambda>x. ennreal (prob {xa \<in> space M. X xa = x}))" 355 by (rule distributed_cong_density[THEN iffD1, OF _ _ _ d]) auto 356qed (rule measure_nonneg) 357 358lemma ac_fst: 359 assumes "sigma_finite_measure T" 360 shows "absolutely_continuous S (distr (S \<Otimes>\<^sub>M T) S fst)" 361proof - 362 interpret sigma_finite_measure T by fact 363 { fix A assume A: "A \<in> sets S" "emeasure S A = 0" 364 then have "fst -` A \<inter> space (S \<Otimes>\<^sub>M T) = A \<times> space T" 365 by (auto simp: space_pair_measure dest!: sets.sets_into_space) 366 with A have "emeasure (S \<Otimes>\<^sub>M T) (fst -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0" 367 by (simp add: emeasure_pair_measure_Times) } 368 then show ?thesis 369 unfolding absolutely_continuous_def 370 apply (auto simp: null_sets_distr_iff) 371 apply (auto simp: null_sets_def intro!: measurable_sets) 372 done 373qed 374 375lemma ac_snd: 376 assumes "sigma_finite_measure T" 377 shows "absolutely_continuous T (distr (S \<Otimes>\<^sub>M T) T snd)" 378proof - 379 interpret sigma_finite_measure T by fact 380 { fix A assume A: "A \<in> sets T" "emeasure T A = 0" 381 then have "snd -` A \<inter> space (S \<Otimes>\<^sub>M T) = space S \<times> A" 382 by (auto simp: space_pair_measure dest!: sets.sets_into_space) 383 with A have "emeasure (S \<Otimes>\<^sub>M T) (snd -` A \<inter> space (S \<Otimes>\<^sub>M T)) = 0" 384 by (simp add: emeasure_pair_measure_Times) } 385 then show ?thesis 386 unfolding absolutely_continuous_def 387 apply (auto simp: null_sets_distr_iff) 388 apply (auto simp: null_sets_def intro!: measurable_sets) 389 done 390qed 391 392lemma (in information_space) finite_entropy_integrable: 393 "finite_entropy S X Px \<Longrightarrow> integrable S (\<lambda>x. Px x * log b (Px x))" 394 unfolding finite_entropy_def by auto 395 396lemma (in information_space) finite_entropy_distributed: 397 "finite_entropy S X Px \<Longrightarrow> distributed M S X Px" 398 unfolding finite_entropy_def by auto 399 400lemma (in information_space) finite_entropy_nn: 401 "finite_entropy S X Px \<Longrightarrow> x \<in> space S \<Longrightarrow> 0 \<le> Px x" 402 by (auto simp: finite_entropy_def) 403 404lemma (in information_space) finite_entropy_measurable: 405 "finite_entropy S X Px \<Longrightarrow> Px \<in> S \<rightarrow>\<^sub>M borel" 406 using distributed_real_measurable[of S Px M X] 407 finite_entropy_nn[of S X Px] finite_entropy_distributed[of S X Px] by auto 408 409lemma (in information_space) subdensity_finite_entropy: 410 fixes g :: "'b \<Rightarrow> real" and f :: "'c \<Rightarrow> real" 411 assumes T: "T \<in> measurable P Q" 412 assumes f: "finite_entropy P X f" 413 assumes g: "finite_entropy Q Y g" 414 assumes Y: "Y = T \<circ> X" 415 shows "AE x in P. g (T x) = 0 \<longrightarrow> f x = 0" 416 using subdensity[OF T, of M X "\<lambda>x. ennreal (f x)" Y "\<lambda>x. ennreal (g x)"] 417 finite_entropy_distributed[OF f] finite_entropy_distributed[OF g] 418 finite_entropy_nn[OF f] finite_entropy_nn[OF g] 419 assms 420 by auto 421 422lemma (in information_space) finite_entropy_integrable_transform: 423 "finite_entropy S X Px \<Longrightarrow> distributed M T Y Py \<Longrightarrow> (\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x) \<Longrightarrow> 424 X = (\<lambda>x. f (Y x)) \<Longrightarrow> f \<in> measurable T S \<Longrightarrow> integrable T (\<lambda>x. Py x * log b (Px (f x)))" 425 using distributed_transform_integrable[of M T Y Py S X Px f "\<lambda>x. log b (Px x)"] 426 using distributed_real_measurable[of S Px M X] 427 by (auto simp: finite_entropy_def) 428 429subsection \<open>Mutual Information\<close> 430 431definition (in prob_space) 432 "mutual_information b S T X Y = 433 KL_divergence b (distr M S X \<Otimes>\<^sub>M distr M T Y) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))" 434 435lemma (in information_space) mutual_information_indep_vars: 436 fixes S T X Y 437 defines "P \<equiv> distr M S X \<Otimes>\<^sub>M distr M T Y" 438 defines "Q \<equiv> distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))" 439 shows "indep_var S X T Y \<longleftrightarrow> 440 (random_variable S X \<and> random_variable T Y \<and> 441 absolutely_continuous P Q \<and> integrable Q (entropy_density b P Q) \<and> 442 mutual_information b S T X Y = 0)" 443 unfolding indep_var_distribution_eq 444proof safe 445 assume rv[measurable]: "random_variable S X" "random_variable T Y" 446 447 interpret X: prob_space "distr M S X" 448 by (rule prob_space_distr) fact 449 interpret Y: prob_space "distr M T Y" 450 by (rule prob_space_distr) fact 451 interpret XY: pair_prob_space "distr M S X" "distr M T Y" by standard 452 interpret P: information_space P b unfolding P_def by standard (rule b_gt_1) 453 454 interpret Q: prob_space Q unfolding Q_def 455 by (rule prob_space_distr) simp 456 457 { assume "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))" 458 then have [simp]: "Q = P" unfolding Q_def P_def by simp 459 460 show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def) 461 then have ed: "entropy_density b P Q \<in> borel_measurable P" 462 by simp 463 464 have "AE x in P. 1 = RN_deriv P Q x" 465 proof (rule P.RN_deriv_unique) 466 show "density P (\<lambda>x. 1) = Q" 467 unfolding \<open>Q = P\<close> by (intro measure_eqI) (auto simp: emeasure_density) 468 qed auto 469 then have ae_0: "AE x in P. entropy_density b P Q x = 0" 470 by eventually_elim (auto simp: entropy_density_def) 471 then have "integrable P (entropy_density b P Q) \<longleftrightarrow> integrable Q (\<lambda>x. 0::real)" 472 using ed unfolding \<open>Q = P\<close> by (intro integrable_cong_AE) auto 473 then show "integrable Q (entropy_density b P Q)" by simp 474 475 from ae_0 have "mutual_information b S T X Y = (\<integral>x. 0 \<partial>P)" 476 unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] \<open>Q = P\<close> 477 by (intro integral_cong_AE) auto 478 then show "mutual_information b S T X Y = 0" 479 by simp } 480 481 { assume ac: "absolutely_continuous P Q" 482 assume int: "integrable Q (entropy_density b P Q)" 483 assume I_eq_0: "mutual_information b S T X Y = 0" 484 485 have eq: "Q = P" 486 proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1]) 487 show "prob_space Q" by unfold_locales 488 show "absolutely_continuous P Q" by fact 489 show "integrable Q (entropy_density b P Q)" by fact 490 show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure) 491 show "KL_divergence b P Q = 0" 492 using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def) 493 qed 494 then show "distr M S X \<Otimes>\<^sub>M distr M T Y = distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))" 495 unfolding P_def Q_def .. } 496qed 497 498abbreviation (in information_space) 499 mutual_information_Pow ("\<I>'(_ ; _')") where 500 "\<I>(X ; Y) \<equiv> mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y" 501 502lemma (in information_space) 503 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 504 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 505 assumes Fx: "finite_entropy S X Px" and Fy: "finite_entropy T Y Py" 506 assumes Fxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 507 defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" 508 shows mutual_information_distr': "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R") 509 and mutual_information_nonneg': "0 \<le> mutual_information b S T X Y" 510proof - 511 have Px: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 512 using Fx by (auto simp: finite_entropy_def) 513 have Py: "distributed M T Y Py" and Py_nn: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 514 using Fy by (auto simp: finite_entropy_def) 515 have Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 516 and Pxy_nn: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x" 517 "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)" 518 using Fxy by (auto simp: finite_entropy_def space_pair_measure) 519 520 have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel" 521 using Px Px_nn by (intro distributed_real_measurable) 522 have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel" 523 using Py Py_nn by (intro distributed_real_measurable) 524 have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel" 525 using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 526 527 have X[measurable]: "random_variable S X" 528 using Px by auto 529 have Y[measurable]: "random_variable T Y" 530 using Py by auto 531 interpret S: sigma_finite_measure S by fact 532 interpret T: sigma_finite_measure T by fact 533 interpret ST: pair_sigma_finite S T .. 534 interpret X: prob_space "distr M S X" using X by (rule prob_space_distr) 535 interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr) 536 interpret XY: pair_prob_space "distr M S X" "distr M T Y" .. 537 let ?P = "S \<Otimes>\<^sub>M T" 538 let ?D = "distr M ?P (\<lambda>x. (X x, Y x))" 539 540 { fix A assume "A \<in> sets S" 541 with X[THEN measurable_space] Y[THEN measurable_space] 542 have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)" 543 by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } 544 note marginal_eq1 = this 545 { fix A assume "A \<in> sets T" 546 with X[THEN measurable_space] Y[THEN measurable_space] 547 have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)" 548 by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } 549 note marginal_eq2 = this 550 551 have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))" 552 unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] 553 proof (subst pair_measure_density) 554 show "(\<lambda>x. ennreal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ennreal (Py y)) \<in> borel_measurable T" 555 using Px Py by (auto simp: distributed_def) 556 show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] .. 557 show "density (S \<Otimes>\<^sub>M T) (\<lambda>(x, y). ennreal (Px x) * ennreal (Py y)) = 558 density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))" 559 using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure) 560 qed fact 561 562 have M: "?M = KL_divergence b (density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ennreal (Pxy x)))" 563 unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] .. 564 565 from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P" 566 by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'') 567 have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)" 568 using Px_nn Py_nn by (auto simp: space_pair_measure) 569 570 have A: "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)" 571 by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure) 572 moreover 573 have B: "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)" 574 by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure) 575 ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 576 by eventually_elim auto 577 578 show "?M = ?R" 579 unfolding M f_def using Pxy_nn Px_nn Py_nn 580 by (intro ST.KL_density_density b_gt_1 f PxPy_nonneg ac) (auto simp: space_pair_measure) 581 582 have X: "X = fst \<circ> (\<lambda>x. (X x, Y x))" and Y: "Y = snd \<circ> (\<lambda>x. (X x, Y x))" 583 by auto 584 585 have "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)))" 586 using finite_entropy_integrable[OF Fxy] 587 using finite_entropy_integrable_transform[OF Fx Pxy, of fst] 588 using finite_entropy_integrable_transform[OF Fy Pxy, of snd] 589 by (simp add: Pxy_nn) 590 moreover have "f \<in> borel_measurable (S \<Otimes>\<^sub>M T)" 591 unfolding f_def using Px Py Pxy 592 by (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'' 593 intro!: borel_measurable_times borel_measurable_log borel_measurable_divide) 594 ultimately have int: "integrable (S \<Otimes>\<^sub>M T) f" 595 apply (rule integrable_cong_AE_imp) 596 using A B AE_space 597 by eventually_elim 598 (auto simp: f_def log_divide_eq log_mult_eq field_simps space_pair_measure Px_nn Py_nn Pxy_nn 599 less_le) 600 601 show "0 \<le> ?M" unfolding M 602 proof (intro ST.KL_density_density_nonneg) 603 show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x))) " 604 unfolding distributed_distr_eq_density[OF Pxy, symmetric] 605 using distributed_measurable[OF Pxy] by (rule prob_space_distr) 606 show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x))))" 607 unfolding distr_eq[symmetric] by unfold_locales 608 show "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))" 609 using int unfolding f_def . 610 qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure) 611qed 612 613lemma (in information_space) 614 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 615 assumes "sigma_finite_measure S" "sigma_finite_measure T" 616 assumes Px: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 617 and Py: "distributed M T Y Py" and Py_nn: "\<And>y. y \<in> space T \<Longrightarrow> 0 \<le> Py y" 618 and Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 619 and Pxy_nn: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)" 620 defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" 621 shows mutual_information_distr: "mutual_information b S T X Y = integral\<^sup>L (S \<Otimes>\<^sub>M T) f" (is "?M = ?R") 622 and mutual_information_nonneg: "integrable (S \<Otimes>\<^sub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y" 623proof - 624 have X[measurable]: "random_variable S X" 625 using Px by (auto simp: distributed_def) 626 have Y[measurable]: "random_variable T Y" 627 using Py by (auto simp: distributed_def) 628 have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel" 629 using Px Px_nn by (intro distributed_real_measurable) 630 have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel" 631 using Py Py_nn by (intro distributed_real_measurable) 632 have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel" 633 using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 634 635 interpret S: sigma_finite_measure S by fact 636 interpret T: sigma_finite_measure T by fact 637 interpret ST: pair_sigma_finite S T .. 638 interpret X: prob_space "distr M S X" using X by (rule prob_space_distr) 639 interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr) 640 interpret XY: pair_prob_space "distr M S X" "distr M T Y" .. 641 let ?P = "S \<Otimes>\<^sub>M T" 642 let ?D = "distr M ?P (\<lambda>x. (X x, Y x))" 643 644 { fix A assume "A \<in> sets S" 645 with X[THEN measurable_space] Y[THEN measurable_space] 646 have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)" 647 by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } 648 note marginal_eq1 = this 649 { fix A assume "A \<in> sets T" 650 with X[THEN measurable_space] Y[THEN measurable_space] 651 have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)" 652 by (auto simp: emeasure_distr intro!: arg_cong[where f="emeasure M"]) } 653 note marginal_eq2 = this 654 655 have distr_eq: "distr M S X \<Otimes>\<^sub>M distr M T Y = density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))" 656 unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] 657 proof (subst pair_measure_density) 658 show "(\<lambda>x. ennreal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ennreal (Py y)) \<in> borel_measurable T" 659 using Px Py by (auto simp: distributed_def) 660 show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] .. 661 show "density (S \<Otimes>\<^sub>M T) (\<lambda>(x, y). ennreal (Px x) * ennreal (Py y)) = 662 density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))" 663 using Px_nn Py_nn by (auto intro!: density_cong simp: distributed_def ennreal_mult space_pair_measure) 664 qed fact 665 666 have M: "?M = KL_divergence b (density ?P (\<lambda>x. ennreal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ennreal (Pxy x)))" 667 unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] .. 668 669 from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P" 670 by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'') 671 have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)" 672 using Px_nn Py_nn by (auto simp: space_pair_measure) 673 674 have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)" 675 by (rule subdensity_real[OF measurable_fst Pxy Px]) (insert Px_nn Pxy_nn, auto simp: space_pair_measure) 676 moreover 677 have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)" 678 by (rule subdensity_real[OF measurable_snd Pxy Py]) (insert Py_nn Pxy_nn, auto simp: space_pair_measure) 679 ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 680 by eventually_elim auto 681 682 show "?M = ?R" 683 unfolding M f_def 684 using b_gt_1 f PxPy_nonneg ac Pxy_nn 685 by (intro ST.KL_density_density) (auto simp: space_pair_measure) 686 687 assume int: "integrable (S \<Otimes>\<^sub>M T) f" 688 show "0 \<le> ?M" unfolding M 689 proof (intro ST.KL_density_density_nonneg) 690 show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x))) " 691 unfolding distributed_distr_eq_density[OF Pxy, symmetric] 692 using distributed_measurable[OF Pxy] by (rule prob_space_distr) 693 show "prob_space (density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Px (fst x) * Py (snd x))))" 694 unfolding distr_eq[symmetric] by unfold_locales 695 show "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))))" 696 using int unfolding f_def . 697 qed (insert ac, auto simp: b_gt_1 Px_nn Py_nn Pxy_nn space_pair_measure) 698qed 699 700lemma (in information_space) 701 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 702 assumes "sigma_finite_measure S" "sigma_finite_measure T" 703 assumes Px[measurable]: "distributed M S X Px" and Px_nn: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 704 and Py[measurable]: "distributed M T Y Py" and Py_nn: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 705 and Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 706 and Pxy_nn: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x" 707 assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y" 708 shows mutual_information_eq_0: "mutual_information b S T X Y = 0" 709proof - 710 interpret S: sigma_finite_measure S by fact 711 interpret T: sigma_finite_measure T by fact 712 interpret ST: pair_sigma_finite S T .. 713 note 714 distributed_real_measurable[OF Px_nn Px, measurable] 715 distributed_real_measurable[OF Py_nn Py, measurable] 716 distributed_real_measurable[OF Pxy_nn Pxy, measurable] 717 718 have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0" 719 by (rule subdensity_real[OF measurable_fst Pxy Px]) (auto simp: Px_nn Pxy_nn space_pair_measure) 720 moreover 721 have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 722 by (rule subdensity_real[OF measurable_snd Pxy Py]) (auto simp: Py_nn Pxy_nn space_pair_measure) 723 moreover 724 have "AE x in S \<Otimes>\<^sub>M T. Pxy x = Px (fst x) * Py (snd x)" 725 by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'') 726 ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0" 727 by eventually_elim simp 728 then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^sub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^sub>M T))" 729 by (intro integral_cong_AE) auto 730 then show ?thesis 731 by (subst mutual_information_distr[OF assms(1-8)]) (auto simp add: space_pair_measure) 732qed 733 734lemma (in information_space) mutual_information_simple_distributed: 735 assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py" 736 assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" 737 shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" 738proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]]) 739 note fin = simple_distributed_joint_finite[OF XY, simp] 740 show "sigma_finite_measure (count_space (X ` space M))" 741 by (simp add: sigma_finite_measure_count_space_finite) 742 show "sigma_finite_measure (count_space (Y ` space M))" 743 by (simp add: sigma_finite_measure_count_space_finite) 744 let ?Pxy = "\<lambda>x. (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)" 745 let ?f = "\<lambda>x. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))" 746 have "\<And>x. ?f x = (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)" 747 by auto 748 with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M))) = 749 (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" 750 by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite sum.If_cases split_beta' 751 intro!: sum.cong) 752qed (insert X Y XY, auto simp: simple_distributed_def) 753 754lemma (in information_space) 755 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 756 assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py" 757 assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" 758 assumes ae: "\<forall>x\<in>space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)" 759 shows mutual_information_eq_0_simple: "\<I>(X ; Y) = 0" 760proof (subst mutual_information_simple_distributed[OF Px Py Pxy]) 761 have "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 762 (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. 0)" 763 by (intro sum.cong) (auto simp: ae) 764 then show "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. 765 Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp 766qed 767 768subsection \<open>Entropy\<close> 769 770definition (in prob_space) entropy :: "real \<Rightarrow> 'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> real" where 771 "entropy b S X = - KL_divergence b S (distr M S X)" 772 773abbreviation (in information_space) 774 entropy_Pow ("\<H>'(_')") where 775 "\<H>(X) \<equiv> entropy b (count_space (X`space M)) X" 776 777lemma (in prob_space) distributed_RN_deriv: 778 assumes X: "distributed M S X Px" 779 shows "AE x in S. RN_deriv S (density S Px) x = Px x" 780proof - 781 note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] 782 interpret X: prob_space "distr M S X" 783 using D(1) by (rule prob_space_distr) 784 785 have sf: "sigma_finite_measure (distr M S X)" by standard 786 show ?thesis 787 using D 788 apply (subst eq_commute) 789 apply (intro RN_deriv_unique_sigma_finite) 790 apply (auto simp: distributed_distr_eq_density[symmetric, OF X] sf) 791 done 792qed 793 794lemma (in information_space) 795 fixes X :: "'a \<Rightarrow> 'b" 796 assumes X[measurable]: "distributed M MX X f" and nn: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> f x" 797 shows entropy_distr: "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" (is ?eq) 798proof - 799 note D = distributed_measurable[OF X] distributed_borel_measurable[OF X] 800 note ae = distributed_RN_deriv[OF X] 801 note distributed_real_measurable[OF nn X, measurable] 802 803 have ae_eq: "AE x in distr M MX X. log b (enn2real (RN_deriv MX (distr M MX X) x)) = 804 log b (f x)" 805 unfolding distributed_distr_eq_density[OF X] 806 apply (subst AE_density) 807 using D apply simp 808 using ae apply eventually_elim 809 apply auto 810 done 811 812 have int_eq: "(\<integral> x. f x * log b (f x) \<partial>MX) = (\<integral> x. log b (f x) \<partial>distr M MX X)" 813 unfolding distributed_distr_eq_density[OF X] 814 using D 815 by (subst integral_density) (auto simp: nn) 816 817 show ?eq 818 unfolding entropy_def KL_divergence_def entropy_density_def comp_def int_eq neg_equal_iff_equal 819 using ae_eq by (intro integral_cong_AE) (auto simp: nn) 820qed 821 822lemma (in information_space) entropy_le: 823 fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure" 824 assumes X[measurable]: "distributed M MX X Px" and Px_nn[simp]: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> Px x" 825 and fin: "emeasure MX {x \<in> space MX. Px x \<noteq> 0} \<noteq> top" 826 and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))" 827 shows "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})" 828proof - 829 note Px = distributed_borel_measurable[OF X] 830 interpret X: prob_space "distr M MX X" 831 using distributed_measurable[OF X] by (rule prob_space_distr) 832 833 have " - log b (measure MX {x \<in> space MX. Px x \<noteq> 0}) = 834 - log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX)" 835 using Px Px_nn fin by (auto simp: measure_def) 836 also have "- log b (\<integral> x. indicator {x \<in> space MX. Px x \<noteq> 0} x \<partial>MX) = - log b (\<integral> x. 1 / Px x \<partial>distr M MX X)" 837 proof - 838 have "integral\<^sup>L MX (indicator {x \<in> space MX. Px x \<noteq> 0}) = LINT x|MX. Px x *\<^sub>R (1 / Px x)" 839 by (rule Bochner_Integration.integral_cong) auto 840 also have "... = LINT x|density MX (\<lambda>x. ennreal (Px x)). 1 / Px x" 841 by (rule integral_density [symmetric]) (use Px Px_nn in auto) 842 finally show ?thesis 843 unfolding distributed_distr_eq_density[OF X] by simp 844 qed 845 also have "\<dots> \<le> (\<integral> x. - log b (1 / Px x) \<partial>distr M MX X)" 846 proof (rule X.jensens_inequality[of "\<lambda>x. 1 / Px x" "{0<..}" 0 1 "\<lambda>x. - log b x"]) 847 show "AE x in distr M MX X. 1 / Px x \<in> {0<..}" 848 unfolding distributed_distr_eq_density[OF X] 849 using Px by (auto simp: AE_density) 850 have [simp]: "\<And>x. x \<in> space MX \<Longrightarrow> ennreal (if Px x = 0 then 0 else 1) = indicator {x \<in> space MX. Px x \<noteq> 0} x" 851 by (auto simp: one_ennreal_def) 852 have "(\<integral>\<^sup>+ x. ennreal (- (if Px x = 0 then 0 else 1)) \<partial>MX) = (\<integral>\<^sup>+ x. 0 \<partial>MX)" 853 by (intro nn_integral_cong) (auto simp: ennreal_neg) 854 then show "integrable (distr M MX X) (\<lambda>x. 1 / Px x)" 855 unfolding distributed_distr_eq_density[OF X] using Px 856 by (auto simp: nn_integral_density real_integrable_def fin ennreal_neg ennreal_mult[symmetric] 857 cong: nn_integral_cong) 858 have "integrable MX (\<lambda>x. Px x * log b (1 / Px x)) = 859 integrable MX (\<lambda>x. - Px x * log b (Px x))" 860 using Px 861 by (intro integrable_cong_AE) (auto simp: log_divide_eq less_le) 862 then show "integrable (distr M MX X) (\<lambda>x. - log b (1 / Px x))" 863 unfolding distributed_distr_eq_density[OF X] 864 using Px int 865 by (subst integrable_real_density) auto 866 qed (auto simp: minus_log_convex[OF b_gt_1]) 867 also have "\<dots> = (\<integral> x. log b (Px x) \<partial>distr M MX X)" 868 unfolding distributed_distr_eq_density[OF X] using Px 869 by (intro integral_cong_AE) (auto simp: AE_density log_divide_eq) 870 also have "\<dots> = - entropy b MX X" 871 unfolding distributed_distr_eq_density[OF X] using Px 872 by (subst entropy_distr[OF X]) (auto simp: integral_density) 873 finally show ?thesis 874 by simp 875qed 876 877lemma (in information_space) entropy_le_space: 878 fixes Px :: "'b \<Rightarrow> real" and MX :: "'b measure" 879 assumes X: "distributed M MX X Px" and Px_nn[simp]: "\<And>x. x \<in> space MX \<Longrightarrow> 0 \<le> Px x" 880 and fin: "finite_measure MX" 881 and int: "integrable MX (\<lambda>x. - Px x * log b (Px x))" 882 shows "entropy b MX X \<le> log b (measure MX (space MX))" 883proof - 884 note Px = distributed_borel_measurable[OF X] 885 interpret finite_measure MX by fact 886 have "entropy b MX X \<le> log b (measure MX {x \<in> space MX. Px x \<noteq> 0})" 887 using int X by (intro entropy_le) auto 888 also have "\<dots> \<le> log b (measure MX (space MX))" 889 using Px distributed_imp_emeasure_nonzero[OF X] 890 by (intro log_le) 891 (auto intro!: finite_measure_mono b_gt_1 less_le[THEN iffD2] 892 simp: emeasure_eq_measure cong: conj_cong) 893 finally show ?thesis . 894qed 895 896lemma (in information_space) entropy_uniform: 897 assumes X: "distributed M MX X (\<lambda>x. indicator A x / measure MX A)" (is "distributed _ _ _ ?f") 898 shows "entropy b MX X = log b (measure MX A)" 899proof (subst entropy_distr[OF X]) 900 have [simp]: "emeasure MX A \<noteq> \<infinity>" 901 using uniform_distributed_params[OF X] by (auto simp add: measure_def) 902 have eq: "(\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) = 903 (\<integral> x. (- log b (measure MX A) / measure MX A) * indicator A x \<partial>MX)" 904 using uniform_distributed_params[OF X] 905 by (intro Bochner_Integration.integral_cong) (auto split: split_indicator simp: log_divide_eq zero_less_measure_iff) 906 show "- (\<integral> x. indicator A x / measure MX A * log b (indicator A x / measure MX A) \<partial>MX) = 907 log b (measure MX A)" 908 unfolding eq using uniform_distributed_params[OF X] 909 by (subst Bochner_Integration.integral_mult_right) (auto simp: measure_def less_top[symmetric] intro!: integrable_real_indicator) 910qed simp 911 912lemma (in information_space) entropy_simple_distributed: 913 "simple_distributed M X f \<Longrightarrow> \<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))" 914 by (subst entropy_distr[OF simple_distributed]) 915 (auto simp add: lebesgue_integral_count_space_finite) 916 917lemma (in information_space) entropy_le_card_not_0: 918 assumes X: "simple_distributed M X f" 919 shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))" 920proof - 921 let ?X = "count_space (X`space M)" 922 have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})" 923 by (rule entropy_le[OF simple_distributed[OF X]]) 924 (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space) 925 also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})" 926 by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def) 927 finally show ?thesis . 928qed 929 930lemma (in information_space) entropy_le_card: 931 assumes X: "simple_distributed M X f" 932 shows "\<H>(X) \<le> log b (real (card (X ` space M)))" 933proof - 934 let ?X = "count_space (X`space M)" 935 have "\<H>(X) \<le> log b (measure ?X (space ?X))" 936 by (rule entropy_le_space[OF simple_distributed[OF X]]) 937 (insert X, auto simp: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space) 938 also have "measure ?X (space ?X) = card (X ` space M)" 939 by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def) 940 finally show ?thesis . 941qed 942 943subsection \<open>Conditional Mutual Information\<close> 944 945definition (in prob_space) 946 "conditional_mutual_information b MX MY MZ X Y Z \<equiv> 947 mutual_information b MX (MY \<Otimes>\<^sub>M MZ) X (\<lambda>x. (Y x, Z x)) - 948 mutual_information b MX MZ X Z" 949 950abbreviation (in information_space) 951 conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where 952 "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b 953 (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z" 954 955lemma (in information_space) 956 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P" 957 assumes Px[measurable]: "distributed M S X Px" 958 and Px_nn[simp]: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 959 assumes Pz[measurable]: "distributed M P Z Pz" 960 and Pz_nn[simp]: "\<And>z. z \<in> space P \<Longrightarrow> 0 \<le> Pz z" 961 assumes Pyz[measurable]: "distributed M (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz" 962 and Pyz_nn[simp]: "\<And>y z. y \<in> space T \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pyz (y, z)" 963 assumes Pxz[measurable]: "distributed M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz" 964 and Pxz_nn[simp]: "\<And>x z. x \<in> space S \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pxz (x, z)" 965 assumes Pxyz[measurable]: "distributed M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz" 966 and Pxyz_nn[simp]: "\<And>x y z. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> z \<in> space P \<Longrightarrow> 0 \<le> Pxyz (x, y, z)" 967 assumes I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))" 968 assumes I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))" 969 shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z 970 = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq") 971 and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg") 972proof - 973 have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel" 974 using Px Px_nn by (intro distributed_real_measurable) 975 have [measurable]: "Pz \<in> P \<rightarrow>\<^sub>M borel" 976 using Pz Pz_nn by (intro distributed_real_measurable) 977 have measurable_Pyz[measurable]: "Pyz \<in> (T \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel" 978 using Pyz Pyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 979 have measurable_Pxz[measurable]: "Pxz \<in> (S \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel" 980 using Pxz Pxz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 981 have measurable_Pxyz[measurable]: "Pxyz \<in> (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) \<rightarrow>\<^sub>M borel" 982 using Pxyz Pxyz_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 983 984 interpret S: sigma_finite_measure S by fact 985 interpret T: sigma_finite_measure T by fact 986 interpret P: sigma_finite_measure P by fact 987 interpret TP: pair_sigma_finite T P .. 988 interpret SP: pair_sigma_finite S P .. 989 interpret ST: pair_sigma_finite S T .. 990 interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T .. 991 interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" .. 992 interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S .. 993 have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" .. 994 have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" .. 995 have YZ: "random_variable (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x))" 996 using Pyz by (simp add: distributed_measurable) 997 998 from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) = 999 distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))" 1000 by (simp add: comp_def distr_distr) 1001 1002 have "mutual_information b S P X Z = 1003 (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))" 1004 by (rule mutual_information_distr[OF S P Px Px_nn Pz Pz_nn Pxz Pxz_nn]) 1005 also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" 1006 using b_gt_1 Pxz Px Pz 1007 by (subst distributed_transform_integral[OF Pxyz _ Pxz _, where T="\<lambda>(x, y, z). (x, z)"]) 1008 (auto simp: split_beta' space_pair_measure) 1009 finally have mi_eq: 1010 "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" . 1011 1012 have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0" 1013 by (intro subdensity_real[of fst, OF _ Pxyz Px]) (auto simp: space_pair_measure) 1014 moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" 1015 by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto simp: space_pair_measure) 1016 moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" 1017 by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto simp: space_pair_measure) 1018 moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0" 1019 by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto simp: space_pair_measure) 1020 ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 1021 Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) - 1022 Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) = 1023 Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) " 1024 using AE_space 1025 proof eventually_elim 1026 case (elim x) 1027 show ?case 1028 proof cases 1029 assume "Pxyz x \<noteq> 0" 1030 with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" 1031 "0 < Pyz (snd x)" "0 < Pxyz x" 1032 by (auto simp: space_pair_measure less_le) 1033 then show ?thesis 1034 using b_gt_1 by (simp add: log_simps less_imp_le field_simps) 1035 qed simp 1036 qed 1037 with I1 I2 show ?eq 1038 unfolding conditional_mutual_information_def 1039 apply (subst mi_eq) 1040 apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz _ Pxyz]) 1041 apply (auto simp: space_pair_measure) 1042 apply (subst Bochner_Integration.integral_diff[symmetric]) 1043 apply (auto intro!: integral_cong_AE simp: split_beta' simp del: Bochner_Integration.integral_diff) 1044 done 1045 1046 let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz" 1047 interpret P: prob_space ?P 1048 unfolding distributed_distr_eq_density[OF Pxyz, symmetric] 1049 by (rule prob_space_distr) simp 1050 1051 let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz" 1052 interpret Q: prob_space ?Q 1053 unfolding distributed_distr_eq_density[OF Pyz, symmetric] 1054 by (rule prob_space_distr) simp 1055 1056 let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)" 1057 1058 from subdensity_real[of snd, OF _ Pyz Pz _ AE_I2 AE_I2] 1059 have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" 1060 by (auto simp: comp_def space_pair_measure) 1061 have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)" 1062 using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def) 1063 1064 have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ennreal (Pxz (x, snd y)) \<partial>S) = ennreal (Pz (snd y))" 1065 using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz] 1066 by (intro TP.AE_pair_measure) auto 1067 1068 have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" 1069 by (subst nn_integral_density) 1070 (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric]) 1071 also have "\<dots> = (\<integral>\<^sup>+(y, z). (\<integral>\<^sup>+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) \<partial>S) \<partial>(T \<Otimes>\<^sub>M P))" 1072 by (subst STP.nn_integral_snd[symmetric]) 1073 (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong) 1074 also have "\<dots> = (\<integral>\<^sup>+x. ennreal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)" 1075 apply (rule nn_integral_cong_AE) 1076 using aeX1 aeX2 aeX3 AE_space 1077 apply eventually_elim 1078 proof (case_tac x, simp add: space_pair_measure) 1079 fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "a \<in> space T \<and> b \<in> space P" 1080 "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) \<partial>S) = ennreal (Pz b)" 1081 then show "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) \<partial>S) = ennreal (Pyz (a, b))" 1082 by (subst nn_integral_multc) (auto split: prod.split simp: ennreal_mult[symmetric]) 1083 qed 1084 also have "\<dots> = 1" 1085 using Q.emeasure_space_1 distributed_distr_eq_density[OF Pyz] 1086 by (subst nn_integral_density[symmetric]) auto 1087 finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" . 1088 also have "\<dots> < \<infinity>" by simp 1089 finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp 1090 1091 have pos: "(\<integral>\<^sup>+x. ?f x \<partial>?P) \<noteq> 0" 1092 apply (subst nn_integral_density) 1093 apply (simp_all add: split_beta') 1094 proof 1095 let ?g = "\<lambda>x. ennreal (Pxyz x) * (Pxz (fst x, snd (snd x)) * Pyz (snd x) / (Pz (snd (snd x)) * Pxyz x))" 1096 assume "(\<integral>\<^sup>+x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0" 1097 then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x = 0" 1098 by (intro nn_integral_0_iff_AE[THEN iffD1]) auto 1099 then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0" 1100 using ae1 ae2 ae3 ae4 AE_space 1101 by eventually_elim (auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure) 1102 then have "(\<integral>\<^sup>+ x. ennreal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0" 1103 by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto 1104 with P.emeasure_space_1 show False 1105 by (subst (asm) emeasure_density) (auto cong: nn_integral_cong) 1106 qed 1107 1108 have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0" 1109 apply (rule nn_integral_0_iff_AE[THEN iffD2]) 1110 apply simp 1111 apply (subst AE_density) 1112 apply (auto simp: space_pair_measure ennreal_neg) 1113 done 1114 1115 have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" 1116 apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ Bochner_Integration.integrable_diff[OF I1 I2]]) 1117 using ae 1118 apply (auto simp: split_beta') 1119 done 1120 1121 have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)" 1122 proof (intro le_imp_neg_le log_le[OF b_gt_1]) 1123 have If: "integrable ?P ?f" 1124 unfolding real_integrable_def 1125 proof (intro conjI) 1126 from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>" 1127 by simp 1128 from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" 1129 by simp 1130 qed simp 1131 then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)" 1132 apply (rule nn_integral_eq_integral) 1133 apply (subst AE_density) 1134 apply simp 1135 apply (auto simp: space_pair_measure ennreal_neg) 1136 done 1137 with pos le1 1138 show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1" 1139 by (simp_all add: one_ennreal.rep_eq zero_less_iff_neq_zero[symmetric]) 1140 qed 1141 also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)" 1142 proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"]) 1143 show "AE x in ?P. ?f x \<in> {0<..}" 1144 unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]] 1145 using ae1 ae2 ae3 ae4 AE_space 1146 by eventually_elim (auto simp: space_pair_measure less_le) 1147 show "integrable ?P ?f" 1148 unfolding real_integrable_def 1149 using fin neg by (auto simp: split_beta') 1150 show "integrable ?P (\<lambda>x. - log b (?f x))" 1151 apply (subst integrable_real_density) 1152 apply simp 1153 apply (auto simp: space_pair_measure) [] 1154 apply simp 1155 apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3]) 1156 apply simp 1157 apply simp 1158 using ae1 ae2 ae3 ae4 AE_space 1159 apply eventually_elim 1160 apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps 1161 less_le space_pair_measure) 1162 done 1163 qed (auto simp: b_gt_1 minus_log_convex) 1164 also have "\<dots> = conditional_mutual_information b S T P X Y Z" 1165 unfolding \<open>?eq\<close> 1166 apply (subst integral_real_density) 1167 apply simp 1168 apply (auto simp: space_pair_measure) [] 1169 apply simp 1170 apply (intro integral_cong_AE) 1171 using ae1 ae2 ae3 ae4 1172 apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps 1173 space_pair_measure less_le) 1174 done 1175 finally show ?nonneg 1176 by simp 1177qed 1178 1179lemma (in information_space) 1180 fixes Px :: "_ \<Rightarrow> real" 1181 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P" 1182 assumes Fx: "finite_entropy S X Px" 1183 assumes Fz: "finite_entropy P Z Pz" 1184 assumes Fyz: "finite_entropy (T \<Otimes>\<^sub>M P) (\<lambda>x. (Y x, Z x)) Pyz" 1185 assumes Fxz: "finite_entropy (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) Pxz" 1186 assumes Fxyz: "finite_entropy (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz" 1187 shows conditional_mutual_information_generic_eq': "conditional_mutual_information b S T P X Y Z 1188 = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" (is "?eq") 1189 and conditional_mutual_information_generic_nonneg': "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg") 1190proof - 1191 note Px = Fx[THEN finite_entropy_distributed, measurable] 1192 note Pz = Fz[THEN finite_entropy_distributed, measurable] 1193 note Pyz = Fyz[THEN finite_entropy_distributed, measurable] 1194 note Pxz = Fxz[THEN finite_entropy_distributed, measurable] 1195 note Pxyz = Fxyz[THEN finite_entropy_distributed, measurable] 1196 1197 note Px_nn = Fx[THEN finite_entropy_nn] 1198 note Pz_nn = Fz[THEN finite_entropy_nn] 1199 note Pyz_nn = Fyz[THEN finite_entropy_nn] 1200 note Pxz_nn = Fxz[THEN finite_entropy_nn] 1201 note Pxyz_nn = Fxyz[THEN finite_entropy_nn] 1202 1203 note Px' = Fx[THEN finite_entropy_measurable, measurable] 1204 note Pz' = Fz[THEN finite_entropy_measurable, measurable] 1205 note Pyz' = Fyz[THEN finite_entropy_measurable, measurable] 1206 note Pxz' = Fxz[THEN finite_entropy_measurable, measurable] 1207 note Pxyz' = Fxyz[THEN finite_entropy_measurable, measurable] 1208 1209 interpret S: sigma_finite_measure S by fact 1210 interpret T: sigma_finite_measure T by fact 1211 interpret P: sigma_finite_measure P by fact 1212 interpret TP: pair_sigma_finite T P .. 1213 interpret SP: pair_sigma_finite S P .. 1214 interpret ST: pair_sigma_finite S T .. 1215 interpret SPT: pair_sigma_finite "S \<Otimes>\<^sub>M P" T .. 1216 interpret STP: pair_sigma_finite S "T \<Otimes>\<^sub>M P" .. 1217 interpret TPS: pair_sigma_finite "T \<Otimes>\<^sub>M P" S .. 1218 have TP: "sigma_finite_measure (T \<Otimes>\<^sub>M P)" .. 1219 have SP: "sigma_finite_measure (S \<Otimes>\<^sub>M P)" .. 1220 1221 from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Z x)) = 1222 distr (distr M (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). (x, z))" 1223 by (simp add: distr_distr comp_def) 1224 1225 have "mutual_information b S P X Z = 1226 (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^sub>M P))" 1227 using Px Px_nn Pz Pz_nn Pxz Pxz_nn 1228 by (rule mutual_information_distr[OF S P]) (auto simp: space_pair_measure) 1229 also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" 1230 using b_gt_1 Pxz Pxz_nn Pxyz Pxyz_nn 1231 by (subst distributed_transform_integral[OF Pxyz _ Pxz, where T="\<lambda>(x, y, z). (x, z)"]) 1232 (auto simp: split_beta') 1233 finally have mi_eq: 1234 "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" . 1235 1236 have ae1: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0" 1237 by (intro subdensity_finite_entropy[of fst, OF _ Fxyz Fx]) auto 1238 moreover have ae2: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" 1239 by (intro subdensity_finite_entropy[of "\<lambda>x. snd (snd x)", OF _ Fxyz Fz]) auto 1240 moreover have ae3: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" 1241 by (intro subdensity_finite_entropy[of "\<lambda>x. (fst x, snd (snd x))", OF _ Fxyz Fxz]) auto 1242 moreover have ae4: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0" 1243 by (intro subdensity_finite_entropy[of snd, OF _ Fxyz Fyz]) auto 1244 ultimately have ae: "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. 1245 Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) - 1246 Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) = 1247 Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) " 1248 using AE_space 1249 proof eventually_elim 1250 case (elim x) 1251 show ?case 1252 proof cases 1253 assume "Pxyz x \<noteq> 0" 1254 with elim have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" 1255 "0 < Pyz (snd x)" "0 < Pxyz x" 1256 using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1257 by (auto simp: space_pair_measure less_le) 1258 then show ?thesis 1259 using b_gt_1 by (simp add: log_simps less_imp_le field_simps) 1260 qed simp 1261 qed 1262 1263 have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) 1264 (\<lambda>x. Pxyz x * log b (Pxyz x) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pyz (snd x)))" 1265 using finite_entropy_integrable[OF Fxyz] 1266 using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst] 1267 using finite_entropy_integrable_transform[OF Fyz Pxyz Pxyz_nn, of snd] 1268 by simp 1269 moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z)))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)" 1270 using Pxyz Px Pyz by simp 1271 ultimately have I1: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))" 1272 apply (rule integrable_cong_AE_imp) 1273 using ae1 ae4 AE_space 1274 by eventually_elim 1275 (insert Px_nn Pyz_nn Pxyz_nn, 1276 auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff space_pair_measure less_le) 1277 1278 have "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) 1279 (\<lambda>x. Pxyz x * log b (Pxz (fst x, snd (snd x))) - Pxyz x * log b (Px (fst x)) - Pxyz x * log b (Pz (snd (snd x))))" 1280 using finite_entropy_integrable_transform[OF Fxz Pxyz Pxyz_nn, of "\<lambda>x. (fst x, snd (snd x))"] 1281 using finite_entropy_integrable_transform[OF Fx Pxyz Pxyz_nn, of fst] 1282 using finite_entropy_integrable_transform[OF Fz Pxyz Pxyz_nn, of "snd \<circ> snd"] 1283 by simp 1284 moreover have "(\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z))) \<in> borel_measurable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)" 1285 using Pxyz Px Pz 1286 by auto 1287 ultimately have I2: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))" 1288 apply (rule integrable_cong_AE_imp) 1289 using ae1 ae2 ae3 ae4 AE_space 1290 by eventually_elim 1291 (insert Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn, 1292 auto simp: log_divide_eq log_mult_eq field_simps zero_less_mult_iff less_le space_pair_measure) 1293 1294 from ae I1 I2 show ?eq 1295 unfolding conditional_mutual_information_def 1296 apply (subst mi_eq) 1297 apply (subst mutual_information_distr[OF S TP Px Px_nn Pyz Pyz_nn Pxyz Pxyz_nn]) 1298 apply simp 1299 apply simp 1300 apply (simp add: space_pair_measure) 1301 apply (subst Bochner_Integration.integral_diff[symmetric]) 1302 apply (auto intro!: integral_cong_AE simp: split_beta' simp del: Bochner_Integration.integral_diff) 1303 done 1304 1305 let ?P = "density (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) Pxyz" 1306 interpret P: prob_space ?P 1307 unfolding distributed_distr_eq_density[OF Pxyz, symmetric] by (rule prob_space_distr) simp 1308 1309 let ?Q = "density (T \<Otimes>\<^sub>M P) Pyz" 1310 interpret Q: prob_space ?Q 1311 unfolding distributed_distr_eq_density[OF Pyz, symmetric] by (rule prob_space_distr) simp 1312 1313 let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)" 1314 1315 from subdensity_finite_entropy[of snd, OF _ Fyz Fz] 1316 have aeX1: "AE x in T \<Otimes>\<^sub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def) 1317 have aeX2: "AE x in T \<Otimes>\<^sub>M P. 0 \<le> Pz (snd x)" 1318 using Pz by (intro TP.AE_pair_measure) (auto intro: Pz_nn) 1319 1320 have aeX3: "AE y in T \<Otimes>\<^sub>M P. (\<integral>\<^sup>+ x. ennreal (Pxz (x, snd y)) \<partial>S) = ennreal (Pz (snd y))" 1321 using Pz distributed_marginal_eq_joint2[OF P S Pz Pxz] 1322 by (intro TP.AE_pair_measure) (auto ) 1323 have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^sup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P))" 1324 using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1325 by (subst nn_integral_density) 1326 (auto intro!: nn_integral_mono simp: space_pair_measure ennreal_mult[symmetric]) 1327 also have "\<dots> = (\<integral>\<^sup>+(y, z). \<integral>\<^sup>+ x. ennreal (Pxz (x, z)) * ennreal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^sub>M P)" 1328 using Px_nn Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1329 by (subst STP.nn_integral_snd[symmetric]) 1330 (auto simp add: split_beta' ennreal_mult[symmetric] space_pair_measure intro!: nn_integral_cong) 1331 also have "\<dots> = (\<integral>\<^sup>+x. ennreal (Pyz x) * 1 \<partial>T \<Otimes>\<^sub>M P)" 1332 apply (rule nn_integral_cong_AE) 1333 using aeX1 aeX2 aeX3 AE_space 1334 apply eventually_elim 1335 proof (case_tac x, simp add: space_pair_measure) 1336 fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P" 1337 "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) \<partial>S) = ennreal (Pz b)" 1338 then show "(\<integral>\<^sup>+ x. ennreal (Pxz (x, b)) * ennreal (Pyz (a, b) / Pz b) \<partial>S) = ennreal (Pyz (a, b))" 1339 using Pyz_nn[of "(a,b)"] 1340 by (subst nn_integral_multc) (auto simp: space_pair_measure ennreal_mult[symmetric]) 1341 qed 1342 also have "\<dots> = 1" 1343 using Q.emeasure_space_1 Pyz_nn distributed_distr_eq_density[OF Pyz] 1344 by (subst nn_integral_density[symmetric]) auto 1345 finally have le1: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<le> 1" . 1346 also have "\<dots> < \<infinity>" by simp 1347 finally have fin: "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp 1348 1349 have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> 0" 1350 using Pxyz_nn 1351 apply (subst nn_integral_density) 1352 apply (simp_all add: split_beta' ennreal_mult'[symmetric] cong: nn_integral_cong) 1353 proof 1354 let ?g = "\<lambda>x. ennreal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))" 1355 assume "(\<integral>\<^sup>+ x. ?g x \<partial>(S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P)) = 0" 1356 then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. ?g x = 0" 1357 by (intro nn_integral_0_iff_AE[THEN iffD1]) auto 1358 then have "AE x in S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P. Pxyz x = 0" 1359 using ae1 ae2 ae3 ae4 AE_space 1360 by eventually_elim 1361 (insert Px_nn Pz_nn Pxz_nn Pyz_nn, 1362 auto split: if_split_asm simp: mult_le_0_iff divide_le_0_iff space_pair_measure) 1363 then have "(\<integral>\<^sup>+ x. ennreal (Pxyz x) \<partial>S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) = 0" 1364 by (subst nn_integral_cong_AE[of _ "\<lambda>x. 0"]) auto 1365 with P.emeasure_space_1 show False 1366 by (subst (asm) emeasure_density) (auto cong: nn_integral_cong) 1367 qed 1368 then have pos: "0 < (\<integral>\<^sup>+ x. ?f x \<partial>?P)" 1369 by (simp add: zero_less_iff_neq_zero) 1370 1371 have neg: "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) = 0" 1372 using Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1373 by (intro nn_integral_0_iff_AE[THEN iffD2]) 1374 (auto simp: split_beta' AE_density space_pair_measure intro!: AE_I2 ennreal_neg) 1375 1376 have I3: "integrable (S \<Otimes>\<^sub>M T \<Otimes>\<^sub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" 1377 apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ Bochner_Integration.integrable_diff[OF I1 I2]]) 1378 using ae 1379 apply (auto simp: split_beta') 1380 done 1381 1382 have "- log b 1 \<le> - log b (integral\<^sup>L ?P ?f)" 1383 proof (intro le_imp_neg_le log_le[OF b_gt_1]) 1384 have If: "integrable ?P ?f" 1385 unfolding real_integrable_def 1386 proof (intro conjI) 1387 from neg show "(\<integral>\<^sup>+ x. - ?f x \<partial>?P) \<noteq> \<infinity>" 1388 by simp 1389 from fin show "(\<integral>\<^sup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" 1390 by simp 1391 qed simp 1392 then have "(\<integral>\<^sup>+ x. ?f x \<partial>?P) = (\<integral>x. ?f x \<partial>?P)" 1393 using Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1394 by (intro nn_integral_eq_integral) 1395 (auto simp: AE_density space_pair_measure) 1396 with pos le1 1397 show "0 < (\<integral>x. ?f x \<partial>?P)" "(\<integral>x. ?f x \<partial>?P) \<le> 1" 1398 by (simp_all add: ) 1399 qed 1400 also have "- log b (integral\<^sup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)" 1401 proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"]) 1402 show "AE x in ?P. ?f x \<in> {0<..}" 1403 unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]] 1404 using ae1 ae2 ae3 ae4 AE_space 1405 by eventually_elim (insert Pxyz_nn Pyz_nn Pz_nn Pxz_nn, auto simp: space_pair_measure less_le) 1406 show "integrable ?P ?f" 1407 unfolding real_integrable_def 1408 using fin neg by (auto simp: split_beta') 1409 show "integrable ?P (\<lambda>x. - log b (?f x))" 1410 using Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1411 apply (subst integrable_real_density) 1412 apply simp 1413 apply simp 1414 apply simp 1415 apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3]) 1416 apply simp 1417 apply simp 1418 using ae1 ae2 ae3 ae4 AE_space 1419 apply eventually_elim 1420 apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff 1421 zero_less_divide_iff field_simps space_pair_measure less_le) 1422 done 1423 qed (auto simp: b_gt_1 minus_log_convex) 1424 also have "\<dots> = conditional_mutual_information b S T P X Y Z" 1425 unfolding \<open>?eq\<close> 1426 using Pz_nn Pxz_nn Pyz_nn Pxyz_nn 1427 apply (subst integral_real_density) 1428 apply simp 1429 apply simp 1430 apply simp 1431 apply (intro integral_cong_AE) 1432 using ae1 ae2 ae3 ae4 AE_space 1433 apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff 1434 field_simps space_pair_measure less_le) 1435 done 1436 finally show ?nonneg 1437 by simp 1438qed 1439 1440lemma (in information_space) conditional_mutual_information_eq: 1441 assumes Pz: "simple_distributed M Z Pz" 1442 assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz" 1443 assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz" 1444 assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz" 1445 shows "\<I>(X ; Y | Z) = 1446 (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" 1447proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _ _ 1448 simple_distributed[OF Pz] _ simple_distributed_joint[OF Pyz] _ simple_distributed_joint[OF Pxz] _ 1449 simple_distributed_joint2[OF Pxyz]]) 1450 note simple_distributed_joint2_finite[OF Pxyz, simp] 1451 show "sigma_finite_measure (count_space (X ` space M))" 1452 by (simp add: sigma_finite_measure_count_space_finite) 1453 show "sigma_finite_measure (count_space (Y ` space M))" 1454 by (simp add: sigma_finite_measure_count_space_finite) 1455 show "sigma_finite_measure (count_space (Z ` space M))" 1456 by (simp add: sigma_finite_measure_count_space_finite) 1457 have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) = 1458 count_space (X`space M \<times> Y`space M \<times> Z`space M)" 1459 (is "?P = ?C") 1460 by (simp add: pair_measure_count_space) 1461 1462 let ?Px = "\<lambda>x. measure M (X -` {x} \<inter> space M)" 1463 have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^sub>M count_space (Z ` space M))" 1464 using simple_distributed_joint[OF Pxz] by (rule distributed_measurable) 1465 from measurable_comp[OF this measurable_fst] 1466 have "random_variable (count_space (X ` space M)) X" 1467 by (simp add: comp_def) 1468 then have "simple_function M X" 1469 unfolding simple_function_def by (auto simp: measurable_count_space_eq2) 1470 then have "simple_distributed M X ?Px" 1471 by (rule simple_distributedI) (auto simp: measure_nonneg) 1472 then show "distributed M (count_space (X ` space M)) X ?Px" 1473 by (rule simple_distributed) 1474 1475 let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)" 1476 let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)" 1477 let ?h = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x)) ` space M then Pxz x else 0)" 1478 show 1479 "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?f (x, y, z) / (?Px x * ?g (y, z))))" 1480 "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?h (x, z) / (?Px x * Pz z)))" 1481 by (auto intro!: integrable_count_space simp: pair_measure_count_space) 1482 let ?i = "\<lambda>x y z. ?f (x, y, z) * log b (?f (x, y, z) / (?h (x, z) * (?g (y, z) / Pz z)))" 1483 let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))" 1484 have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)" 1485 by (auto intro!: ext) 1486 then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)" 1487 by (auto intro!: sum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite sum.If_cases split_beta') 1488qed (insert Pz Pyz Pxz Pxyz, auto intro: measure_nonneg) 1489 1490lemma (in information_space) conditional_mutual_information_nonneg: 1491 assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z" 1492 shows "0 \<le> \<I>(X ; Y | Z)" 1493proof - 1494 have [simp]: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) \<Otimes>\<^sub>M count_space (Z ` space M) = 1495 count_space (X`space M \<times> Y`space M \<times> Z`space M)" 1496 by (simp add: pair_measure_count_space X Y Z simple_functionD) 1497 note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)] 1498 note sd = simple_distributedI[OF _ _ refl] 1499 note sp = simple_function_Pair 1500 show ?thesis 1501 apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]]) 1502 apply (rule simple_distributed[OF sd[OF X]]) 1503 apply simp 1504 apply simp 1505 apply (rule simple_distributed[OF sd[OF Z]]) 1506 apply simp 1507 apply simp 1508 apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]]) 1509 apply simp 1510 apply simp 1511 apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]]) 1512 apply simp 1513 apply simp 1514 apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]]) 1515 apply simp 1516 apply simp 1517 apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD) 1518 done 1519qed 1520 1521subsection \<open>Conditional Entropy\<close> 1522 1523definition (in prob_space) 1524 "conditional_entropy b S T X Y = - (\<integral>(x, y). log b (enn2real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) (x, y)) / 1525 enn2real (RN_deriv T (distr M T Y) y)) \<partial>distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)))" 1526 1527abbreviation (in information_space) 1528 conditional_entropy_Pow ("\<H>'(_ | _')") where 1529 "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y" 1530 1531lemma (in information_space) conditional_entropy_generic_eq: 1532 fixes Pxy :: "_ \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 1533 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 1534 assumes Py[measurable]: "distributed M T Y Py" and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 1535 assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 1536 and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)" 1537 shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^sub>M T))" 1538proof - 1539 interpret S: sigma_finite_measure S by fact 1540 interpret T: sigma_finite_measure T by fact 1541 interpret ST: pair_sigma_finite S T .. 1542 1543 have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel" 1544 using Py Py_nn by (intro distributed_real_measurable) 1545 have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel" 1546 using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 1547 1548 have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x)). Pxy x = enn2real (RN_deriv (S \<Otimes>\<^sub>M T) (distr M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))) x)" 1549 unfolding AE_density[OF distributed_borel_measurable, OF Pxy] 1550 unfolding distributed_distr_eq_density[OF Pxy] 1551 using distributed_RN_deriv[OF Pxy] 1552 by auto 1553 moreover 1554 have "AE x in density (S \<Otimes>\<^sub>M T) (\<lambda>x. ennreal (Pxy x)). Py (snd x) = enn2real (RN_deriv T (distr M T Y) (snd x))" 1555 unfolding AE_density[OF distributed_borel_measurable, OF Pxy] 1556 unfolding distributed_distr_eq_density[OF Py] 1557 apply (rule ST.AE_pair_measure) 1558 apply auto 1559 using distributed_RN_deriv[OF Py] 1560 apply auto 1561 done 1562 ultimately 1563 have "conditional_entropy b S T X Y = - (\<integral>x. Pxy x * log b (Pxy x / Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))" 1564 unfolding conditional_entropy_def neg_equal_iff_equal 1565 apply (subst integral_real_density[symmetric]) 1566 apply (auto simp: distributed_distr_eq_density[OF Pxy] space_pair_measure 1567 intro!: integral_cong_AE) 1568 done 1569 then show ?thesis by (simp add: split_beta') 1570qed 1571 1572lemma (in information_space) conditional_entropy_eq_entropy: 1573 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" 1574 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 1575 assumes Py[measurable]: "distributed M T Y Py" 1576 and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 1577 assumes Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 1578 and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)" 1579 assumes I1: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))" 1580 assumes I2: "integrable (S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))" 1581 shows "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y" 1582proof - 1583 interpret S: sigma_finite_measure S by fact 1584 interpret T: sigma_finite_measure T by fact 1585 interpret ST: pair_sigma_finite S T .. 1586 1587 have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel" 1588 using Py Py_nn by (intro distributed_real_measurable) 1589 have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel" 1590 using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 1591 1592 have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)" 1593 by (rule entropy_distr[OF Py Py_nn]) 1594 also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" 1595 using b_gt_1 1596 by (subst distributed_transform_integral[OF Pxy _ Py, where T=snd]) 1597 (auto intro!: Bochner_Integration.integral_cong simp: space_pair_measure) 1598 finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^sub>M T))" . 1599 1600 have **: "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x" 1601 by (auto simp: space_pair_measure) 1602 1603 have ae2: "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 1604 by (intro subdensity_real[of snd, OF _ Pxy Py]) 1605 (auto intro: measurable_Pair simp: space_pair_measure) 1606 moreover have ae4: "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)" 1607 using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'') 1608 ultimately have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Pxy x \<and> 0 \<le> Py (snd x) \<and> 1609 (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Py (snd x)))" 1610 using AE_space by eventually_elim (auto simp: space_pair_measure less_le) 1611 then have ae: "AE x in S \<Otimes>\<^sub>M T. 1612 Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))" 1613 by eventually_elim (auto simp: log_simps field_simps b_gt_1) 1614 have "conditional_entropy b S T X Y = 1615 - (\<integral>x. Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))" 1616 unfolding conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified] neg_equal_iff_equal 1617 apply (intro integral_cong_AE) 1618 using ae 1619 apply auto 1620 done 1621 also have "\<dots> = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T)) - - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))" 1622 by (simp add: Bochner_Integration.integral_diff[OF I1 I2]) 1623 finally show ?thesis 1624 using conditional_entropy_generic_eq[OF S T Py Py_nn Pxy Pxy_nn, simplified] 1625 entropy_distr[OF Pxy **, simplified] e_eq 1626 by (simp add: split_beta') 1627qed 1628 1629lemma (in information_space) conditional_entropy_eq_entropy_simple: 1630 assumes X: "simple_function M X" and Y: "simple_function M Y" 1631 shows "\<H>(X | Y) = entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x)) - \<H>(Y)" 1632proof - 1633 have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)" 1634 (is "?P = ?C") using X Y by (simp add: simple_functionD pair_measure_count_space) 1635 show ?thesis 1636 by (rule conditional_entropy_eq_entropy sigma_finite_measure_count_space_finite 1637 simple_functionD X Y simple_distributed simple_distributedI[OF _ _ refl] 1638 simple_distributed_joint simple_function_Pair integrable_count_space measure_nonneg)+ 1639 (auto simp: \<open>?P = ?C\<close> measure_nonneg intro!: integrable_count_space simple_functionD X Y) 1640qed 1641 1642lemma (in information_space) conditional_entropy_eq: 1643 assumes Y: "simple_distributed M Y Py" 1644 assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" 1645 shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))" 1646proof (subst conditional_entropy_generic_eq[OF _ _ 1647 simple_distributed[OF Y] _ simple_distributed_joint[OF XY]]) 1648 have "finite ((\<lambda>x. (X x, Y x))`space M)" 1649 using XY unfolding simple_distributed_def by auto 1650 from finite_imageI[OF this, of fst] 1651 have [simp]: "finite (X`space M)" 1652 by (simp add: image_comp comp_def) 1653 note Y[THEN simple_distributed_finite, simp] 1654 show "sigma_finite_measure (count_space (X ` space M))" 1655 by (simp add: sigma_finite_measure_count_space_finite) 1656 show "sigma_finite_measure (count_space (Y ` space M))" 1657 by (simp add: sigma_finite_measure_count_space_finite) 1658 let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)" 1659 have "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)" 1660 (is "?P = ?C") 1661 using Y by (simp add: simple_distributed_finite pair_measure_count_space) 1662 have eq: "(\<lambda>(x, y). ?f (x, y) * log b (?f (x, y) / Py y)) = 1663 (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)" 1664 by auto 1665 from Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) = 1666 - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))" 1667 by (auto intro!: sum.cong simp add: \<open>?P = ?C\<close> lebesgue_integral_count_space_finite simple_distributed_finite eq sum.If_cases split_beta') 1668qed (insert Y XY, auto) 1669 1670lemma (in information_space) conditional_mutual_information_eq_conditional_entropy: 1671 assumes X: "simple_function M X" and Y: "simple_function M Y" 1672 shows "\<I>(X ; X | Y) = \<H>(X | Y)" 1673proof - 1674 define Py where "Py x = (if x \<in> Y`space M then measure M (Y -` {x} \<inter> space M) else 0)" for x 1675 define Pxy where "Pxy x = 1676 (if x \<in> (\<lambda>x. (X x, Y x))`space M then measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M) else 0)" 1677 for x 1678 define Pxxy where "Pxxy x = 1679 (if x \<in> (\<lambda>x. (X x, X x, Y x))`space M then measure M ((\<lambda>x. (X x, X x, Y x)) -` {x} \<inter> space M) 1680 else 0)" 1681 for x 1682 let ?M = "X`space M \<times> X`space M \<times> Y`space M" 1683 1684 note XY = simple_function_Pair[OF X Y] 1685 note XXY = simple_function_Pair[OF X XY] 1686 have Py: "simple_distributed M Y Py" 1687 using Y by (rule simple_distributedI) (auto simp: Py_def measure_nonneg) 1688 have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" 1689 using XY by (rule simple_distributedI) (auto simp: Pxy_def measure_nonneg) 1690 have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy" 1691 using XXY by (rule simple_distributedI) (auto simp: Pxxy_def measure_nonneg) 1692 have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M" 1693 by auto 1694 have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A" 1695 by (auto simp: inj_on_def) 1696 have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)" 1697 by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob]) 1698 have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 1699 using Py Pxy 1700 by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]]) 1701 (auto intro: measurable_Pair simp: AE_count_space) 1702 then show ?thesis 1703 apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy]) 1704 apply (subst conditional_entropy_eq[OF Py Pxy]) 1705 apply (auto intro!: sum.cong simp: Pxxy_eq sum_negf[symmetric] eq sum.reindex[OF inj] 1706 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space) 1707 using Py[THEN simple_distributed] Pxy[THEN simple_distributed] 1708 apply (auto simp add: not_le AE_count_space less_le antisym 1709 simple_distributed_nonneg[OF Py] simple_distributed_nonneg[OF Pxy]) 1710 done 1711qed 1712 1713lemma (in information_space) conditional_entropy_nonneg: 1714 assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)" 1715 using conditional_mutual_information_eq_conditional_entropy[OF X Y] conditional_mutual_information_nonneg[OF X X Y] 1716 by simp 1717 1718subsection \<open>Equalities\<close> 1719 1720lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr: 1721 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real" 1722 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 1723 assumes Px[measurable]: "distributed M S X Px" 1724 and Px_nn[simp]: "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 1725 and Py[measurable]: "distributed M T Y Py" 1726 and Py_nn[simp]: "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 1727 and Pxy[measurable]: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 1728 and Pxy_nn[simp]: "\<And>x y. x \<in> space S \<Longrightarrow> y \<in> space T \<Longrightarrow> 0 \<le> Pxy (x, y)" 1729 assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))" 1730 assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))" 1731 assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))" 1732 shows "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x))" 1733proof - 1734 have [measurable]: "Px \<in> S \<rightarrow>\<^sub>M borel" 1735 using Px Px_nn by (intro distributed_real_measurable) 1736 have [measurable]: "Py \<in> T \<rightarrow>\<^sub>M borel" 1737 using Py Py_nn by (intro distributed_real_measurable) 1738 have measurable_Pxy[measurable]: "Pxy \<in> (S \<Otimes>\<^sub>M T) \<rightarrow>\<^sub>M borel" 1739 using Pxy Pxy_nn by (intro distributed_real_measurable) (auto simp: space_pair_measure) 1740 1741 have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^sub>M T))" 1742 using b_gt_1 1743 apply (subst entropy_distr[OF Px Px_nn], simp) 1744 apply (subst distributed_transform_integral[OF Pxy _ Px, where T=fst]) 1745 apply (auto intro!: integral_cong simp: space_pair_measure) 1746 done 1747 1748 have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^sub>M T))" 1749 using b_gt_1 1750 apply (subst entropy_distr[OF Py Py_nn], simp) 1751 apply (subst distributed_transform_integral[OF Pxy _ Py, where T=snd]) 1752 apply (auto intro!: integral_cong simp: space_pair_measure) 1753 done 1754 1755 interpret S: sigma_finite_measure S by fact 1756 interpret T: sigma_finite_measure T by fact 1757 interpret ST: pair_sigma_finite S T .. 1758 have ST: "sigma_finite_measure (S \<Otimes>\<^sub>M T)" .. 1759 1760 have XY: "entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^sub>M T))" 1761 by (subst entropy_distr[OF Pxy]) (auto intro!: integral_cong simp: space_pair_measure) 1762 1763 have "AE x in S \<Otimes>\<^sub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0" 1764 by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair simp: space_pair_measure) 1765 moreover have "AE x in S \<Otimes>\<^sub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" 1766 by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair simp: space_pair_measure) 1767 moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Px (fst x)" 1768 using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'') 1769 moreover have "AE x in S \<Otimes>\<^sub>M T. 0 \<le> Py (snd x)" 1770 using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'') 1771 ultimately have "AE x in S \<Otimes>\<^sub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) = 1772 Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" 1773 (is "AE x in _. ?f x = ?g x") 1774 using AE_space 1775 proof eventually_elim 1776 case (elim x) 1777 show ?case 1778 proof cases 1779 assume "Pxy x \<noteq> 0" 1780 with elim have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x" 1781 by (auto simp: space_pair_measure less_le) 1782 then show ?thesis 1783 using b_gt_1 by (simp add: log_simps less_imp_le field_simps) 1784 qed simp 1785 qed 1786 1787 have "entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?f" 1788 unfolding X Y XY 1789 apply (subst Bochner_Integration.integral_diff) 1790 apply (intro Bochner_Integration.integrable_diff Ixy Ix Iy)+ 1791 apply (subst Bochner_Integration.integral_diff) 1792 apply (intro Ixy Ix Iy)+ 1793 apply (simp add: field_simps) 1794 done 1795 also have "\<dots> = integral\<^sup>L (S \<Otimes>\<^sub>M T) ?g" 1796 using \<open>AE x in _. ?f x = ?g x\<close> by (intro integral_cong_AE) auto 1797 also have "\<dots> = mutual_information b S T X Y" 1798 by (rule mutual_information_distr[OF S T Px _ Py _ Pxy _ , symmetric]) 1799 (auto simp: space_pair_measure) 1800 finally show ?thesis .. 1801qed 1802 1803lemma (in information_space) mutual_information_eq_entropy_conditional_entropy': 1804 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real" 1805 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 1806 assumes Px: "distributed M S X Px" "\<And>x. x \<in> space S \<Longrightarrow> 0 \<le> Px x" 1807 and Py: "distributed M T Y Py" "\<And>x. x \<in> space T \<Longrightarrow> 0 \<le> Py x" 1808 assumes Pxy: "distributed M (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 1809 "\<And>x. x \<in> space (S \<Otimes>\<^sub>M T) \<Longrightarrow> 0 \<le> Pxy x" 1810 assumes Ix: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))" 1811 assumes Iy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))" 1812 assumes Ixy: "integrable(S \<Otimes>\<^sub>M T) (\<lambda>x. Pxy x * log b (Pxy x))" 1813 shows "mutual_information b S T X Y = entropy b S X - conditional_entropy b S T X Y" 1814 using 1815 mutual_information_eq_entropy_conditional_entropy_distr[OF S T Px Py Pxy Ix Iy Ixy] 1816 conditional_entropy_eq_entropy[OF S T Py Pxy Ixy Iy] 1817 by (simp add: space_pair_measure) 1818 1819lemma (in information_space) mutual_information_eq_entropy_conditional_entropy: 1820 assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y" 1821 shows "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)" 1822proof - 1823 have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))" 1824 using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg) 1825 have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))" 1826 using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg) 1827 have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))" 1828 using sf_X sf_Y by (rule simple_function_Pair) 1829 then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))" 1830 by (rule simple_distributedI) (auto simp: measure_nonneg) 1831 from simple_distributed_joint_finite[OF this, simp] 1832 have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)" 1833 by (simp add: pair_measure_count_space) 1834 1835 have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^sub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))" 1836 using sigma_finite_measure_count_space_finite 1837 sigma_finite_measure_count_space_finite 1838 simple_distributed[OF X] measure_nonneg simple_distributed[OF Y] measure_nonneg simple_distributed_joint[OF XY] 1839 by (rule mutual_information_eq_entropy_conditional_entropy_distr) 1840 (auto simp: eq integrable_count_space measure_nonneg) 1841 then show ?thesis 1842 unfolding conditional_entropy_eq_entropy_simple[OF sf_X sf_Y] by simp 1843qed 1844 1845lemma (in information_space) mutual_information_nonneg_simple: 1846 assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y" 1847 shows "0 \<le> \<I>(X ; Y)" 1848proof - 1849 have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))" 1850 using sf_X by (rule simple_distributedI) (auto simp: measure_nonneg) 1851 have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))" 1852 using sf_Y by (rule simple_distributedI) (auto simp: measure_nonneg) 1853 1854 have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))" 1855 using sf_X sf_Y by (rule simple_function_Pair) 1856 then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))" 1857 by (rule simple_distributedI) (auto simp: measure_nonneg) 1858 1859 from simple_distributed_joint_finite[OF this, simp] 1860 have eq: "count_space (X ` space M) \<Otimes>\<^sub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)" 1861 by (simp add: pair_measure_count_space) 1862 1863 show ?thesis 1864 by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] _ simple_distributed[OF Y] _ simple_distributed_joint[OF XY]]) 1865 (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite measure_nonneg) 1866qed 1867 1868lemma (in information_space) conditional_entropy_less_eq_entropy: 1869 assumes X: "simple_function M X" and Z: "simple_function M Z" 1870 shows "\<H>(X | Z) \<le> \<H>(X)" 1871proof - 1872 have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple) 1873 also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] . 1874 finally show ?thesis by auto 1875qed 1876 1877lemma (in information_space) 1878 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real" 1879 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" 1880 assumes Px: "finite_entropy S X Px" and Py: "finite_entropy T Y Py" 1881 assumes Pxy: "finite_entropy (S \<Otimes>\<^sub>M T) (\<lambda>x. (X x, Y x)) Pxy" 1882 shows "conditional_entropy b S T X Y \<le> entropy b S X" 1883proof - 1884 1885 have "0 \<le> mutual_information b S T X Y" 1886 by (rule mutual_information_nonneg') fact+ 1887 also have "\<dots> = entropy b S X - conditional_entropy b S T X Y" 1888 apply (rule mutual_information_eq_entropy_conditional_entropy') 1889 using assms 1890 by (auto intro!: finite_entropy_integrable finite_entropy_distributed 1891 finite_entropy_integrable_transform[OF Px] 1892 finite_entropy_integrable_transform[OF Py] 1893 intro: finite_entropy_nn) 1894 finally show ?thesis by auto 1895qed 1896 1897lemma (in information_space) entropy_chain_rule: 1898 assumes X: "simple_function M X" and Y: "simple_function M Y" 1899 shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)" 1900proof - 1901 note XY = simple_distributedI[OF simple_function_Pair[OF X Y] measure_nonneg refl] 1902 note YX = simple_distributedI[OF simple_function_Pair[OF Y X] measure_nonneg refl] 1903 note simple_distributed_joint_finite[OF this, simp] 1904 let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)" 1905 let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)" 1906 let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0" 1907 have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))" 1908 using XY by (rule entropy_simple_distributed) 1909 also have "\<dots> = - (\<Sum>x\<in>(\<lambda>(x, y). (y, x)) ` (\<lambda>x. (X x, Y x)) ` space M. ?g x * log b (?g x))" 1910 by (subst (2) sum.reindex) (auto simp: inj_on_def intro!: sum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"]) 1911 also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))" 1912 by (auto intro!: sum.cong) 1913 also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" 1914 by (subst entropy_distr[OF simple_distributed_joint[OF YX]]) 1915 (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite 1916 cong del: sum.cong_simp intro!: sum.mono_neutral_left measure_nonneg) 1917 finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^sub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" . 1918 then show ?thesis 1919 unfolding conditional_entropy_eq_entropy_simple[OF Y X] by simp 1920qed 1921 1922lemma (in information_space) entropy_partition: 1923 assumes X: "simple_function M X" 1924 shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)" 1925proof - 1926 note fX = simple_function_compose[OF X, of f] 1927 have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto 1928 have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A" 1929 by (auto simp: inj_on_def) 1930 show ?thesis 1931 apply (subst entropy_chain_rule[symmetric, OF fX X]) 1932 apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] measure_nonneg refl]]) 1933 apply (subst entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]]) 1934 unfolding eq 1935 apply (subst sum.reindex[OF inj]) 1936 apply (auto intro!: sum.cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"]) 1937 done 1938qed 1939 1940corollary (in information_space) entropy_data_processing: 1941 assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)" 1942proof - 1943 note fX = simple_function_compose[OF X, of f] 1944 from X have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition) 1945 then show "\<H>(f \<circ> X) \<le> \<H>(X)" 1946 by (simp only: conditional_entropy_nonneg [OF X fX] le_add_same_cancel1) 1947qed 1948 1949corollary (in information_space) entropy_of_inj: 1950 assumes X: "simple_function M X" and inj: "inj_on f (X`space M)" 1951 shows "\<H>(f \<circ> X) = \<H>(X)" 1952proof (rule antisym) 1953 show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] . 1954next 1955 have sf: "simple_function M (f \<circ> X)" 1956 using X by auto 1957 have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))" 1958 unfolding o_assoc 1959 apply (subst entropy_simple_distributed[OF simple_distributedI[OF X measure_nonneg refl]]) 1960 apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_compose[OF X]], where f="\<lambda>x. prob (X -` {x} \<inter> space M)"]) 1961 apply (auto intro!: sum.cong arg_cong[where f=prob] image_eqI simp: the_inv_into_f_f[OF inj] comp_def measure_nonneg) 1962 done 1963 also have "... \<le> \<H>(f \<circ> X)" 1964 using entropy_data_processing[OF sf] . 1965 finally show "\<H>(X) \<le> \<H>(f \<circ> X)" . 1966qed 1967 1968end 1969